Commit 11d17938 authored by seiriosPlus

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/large_scale_kv_save_delta
......@@ -63,8 +63,29 @@ if(WIN32)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif()
endforeach(flag_var)
endif()
# Windows build: turn off warnings.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
endforeach(flag_var)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838 /MP")
message(STATUS "Using parallel compiling (/MP)")
......
......@@ -22,23 +22,8 @@ SET(CRYPTOPP_TAG CRYPTOPP_8_2_0)
IF(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/cryptopp-static.lib" CACHE FILEPATH "cryptopp library." FORCE)
SET(CRYPTOPP_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd")
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
ELSE(WIN32)
SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE)
SET(CRYPTOPP_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
ENDIF(WIN32)
set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
......@@ -48,7 +33,7 @@ set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS}
-DCMAKE_INSTALL_LIBDIR=${CRYPTOPP_INSTALL_DIR}/lib
-DCMAKE_INSTALL_PREFIX=${CRYPTOPP_INSTALL_DIR}
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DCMAKE_CXX_FLAGS=${CRYPTOPP_CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
......
......@@ -90,20 +90,6 @@ macro(safe_set_nvflag flag_name)
endif()
endmacro()
macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
if (BUILD_SHARED_LIBS)
return() # if build shared libs, the flags keep same with '/MD'
endif(BUILD_SHARED_LIBS)
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
if(${flag_var} MATCHES "/MD")
string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endif(${flag_var} MATCHES "/MD")
endforeach(flag_var)
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
......@@ -229,20 +215,3 @@ endforeach()
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}")
if(WIN32)
# Windows build: turn off warnings.
if(MSVC_STATIC_CRT)
safe_set_static_flag()
endif()
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w")
endforeach(flag_var)
endif()
......@@ -13,18 +13,18 @@
# limitations under the License.
# make package for paddle fluid shared and static library
set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
"A path setting fluid shared and static libraries")
set(PADDLE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_install_dir" CACHE STRING
"A path setting paddle shared and static libraries")
set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING
"A path setting fluid inference shared and static libraries")
set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir" CACHE STRING
"A path setting paddle inference shared and static libraries")
# TODO(zhaolong)
# At present, the size of static lib in Windows exceeds the system limit,
# so the generation of static lib is temporarily turned off.
if(WIN32)
#todo: remove the option
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF)
option(WITH_STATIC_LIB "Compile demo with static/shared library, default use dynamic." OFF)
if(NOT PYTHON_EXECUTABLE)
FIND_PACKAGE(PythonInterp REQUIRED)
endif()
......@@ -142,14 +142,14 @@ set(inference_lib_deps third_party paddle_fluid paddle_fluid_c paddle_fluid_shar
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps})
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/threadpool")
set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/threadpool")
copy(inference_lib_dist
SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
DSTS ${dst_dir})
# Only GPU builds need cudaErrorMessage.pb
IF(WITH_GPU)
set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
copy(inference_lib_dist
SRCS ${cudaerror_INCLUDE_DIR}
DSTS ${dst_dir})
......@@ -158,65 +158,62 @@ ENDIF()
# CMakeCache Info
copy(inference_lib_dist
SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INFERENCE_INSTALL_DIR})
DSTS ${PADDLE_INFERENCE_INSTALL_DIR})
copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_INSTALL_DIR})
copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
if(WIN32)
if(WITH_STATIC_LIB)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.lib
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.*)
else()
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.dll
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/paddle_fluid.lib)
endif()
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib
${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
else(WIN32)
set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*)
endif(WIN32)
if(WIN32 AND NOT WITH_STATIC_LIB)
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib
${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
else()
copy(inference_lib_dist
copy(inference_lib_dist
SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib)
endif()
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib)
endif(WIN32)
copy(inference_lib_dist
SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/internal)
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/internal)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io/crypto/cipher.h
DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/)
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io)
# CAPI inference library for only inference
set(FLUID_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_c_install_dir" CACHE STRING
"A path setting CAPI fluid inference shared")
copy_part_of_thrid_party(inference_lib_dist ${FLUID_INFERENCE_C_INSTALL_DIR})
set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING
"A path setting CAPI paddle inference shared")
copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_C_INSTALL_DIR})
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
set(paddle_fluid_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_fluid_c.*)
copy(inference_lib_dist
SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_fluid_c_lib}
DSTS ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_C_INSTALL_DIR}/paddle/lib)
DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib)
# fluid library for both train and inference
set(fluid_lib_deps inference_lib_dist)
add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps})
set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
set(dst_dir "${PADDLE_INSTALL_DIR}/paddle/fluid")
set(module "inference")
if(WIN32 AND NOT WITH_STATIC_LIB)
if(WIN32)
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
)
else()
else()
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib}
DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
......@@ -273,22 +270,22 @@ copy(fluid_lib_dist
DSTS ${dst_dir}/${module}
)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/eigen3")
copy(inference_lib_dist
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost")
set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/boost")
copy(inference_lib_dist
SRCS ${BOOST_INCLUDE_DIR}/boost
DSTS ${dst_dir})
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/dlpack")
set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/dlpack")
copy(inference_lib_dist
SRCS ${DLPACK_INCLUDE_DIR}/dlpack
DSTS ${dst_dir})
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
set(dst_dir "${PADDLE_INSTALL_DIR}/third_party/install/zlib")
copy(inference_lib_dist
SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib)
......@@ -296,8 +293,8 @@ copy(inference_lib_dist
# CMakeCache Info
copy(fluid_lib_dist
SRCS ${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR}
SRCS ${PADDLE_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
DSTS ${PADDLE_INSTALL_DIR} ${PADDLE_INSTALL_DIR}
)
# paddle fluid version
......@@ -323,6 +320,6 @@ function(version version_file)
endif()
endfunction()
version(${FLUID_INSTALL_DIR}/version.txt)
version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt)
version(${FLUID_INFERENCE_C_INSTALL_DIR}/version.txt)
version(${PADDLE_INSTALL_DIR}/version.txt)
version(${PADDLE_INFERENCE_INSTALL_DIR}/version.txt)
version(${PADDLE_INFERENCE_C_INSTALL_DIR}/version.txt)
......@@ -127,7 +127,8 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op"
"fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......
# Paddle Inference Golang API
## Installation
First, enable `-DON_INFER=ON` when building with cmake; the build directory will then contain ``fluid_inference_c_install_dir``. Move that directory into the current directory and rename it to `paddle_c`.
First, enable `-DON_INFER=ON` when building with cmake; the build directory will then contain ``paddle_inference_c_install_dir``. Move that directory into the current directory and rename it to `paddle_c`.
## Using Paddle Inference in Go
First, create an inference config.
......
......@@ -74,6 +74,7 @@ set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto
eager_deletion_pass
buffer_shared_inplace_op_pass
buffer_shared_cross_op_memory_reuse_pass
inplace_addto_op_pass
set_reader_device_info_utils
add_reader_dependency_pass)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
......
......@@ -12,7 +12,9 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
#include <algorithm>
#include "paddle/fluid/framework/details/container_cast.h"
#include "paddle/fluid/framework/details/reduce_and_gather.h"
#include "paddle/fluid/framework/details/variable_visitor.h"
......@@ -34,14 +36,24 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<platform::Place> &places,
const platform::NCCLCommunicator *ctxs)
: NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
}
#else
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
}
#endif
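The hunks above and below follow one recurring pattern in this commit: bare-string `PADDLE_ENFORCE_*` checks are rewritten to pass a typed `platform::errors::*` message with the mismatched values interpolated. As a minimal, framework-free sketch of that validation style (hypothetical helper in plain C++, not Paddle's actual API):
```cpp
#include <sstream>
#include <stdexcept>

// Hypothetical stand-in for PADDLE_ENFORCE_EQ(..., platform::errors::InvalidArgument(...)):
// compare two counts and throw a typed exception whose message reports both values.
void EnforceEqualCounts(size_t num_places, size_t num_local_scopes) {
  if (num_places != num_local_scopes) {
    std::ostringstream msg;
    msg << "The number of places and the number of local scopes should be equal, "
        << "but got number of places is " << num_places
        << " and number of local scopes is " << num_local_scopes << ".";
    throw std::invalid_argument(msg.str());  // plays the role of errors::InvalidArgument
  }
}
```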
......@@ -60,13 +72,25 @@ void AllReduceOpHandle::AllReduceImpl(
const std::vector<VarHandle *> &in_var_handles,
const std::vector<VarHandle *> &out_var_handles) {
size_t num_places = places_.size();
PADDLE_ENFORCE_EQ(
in_var_handles.size(), num_places,
"The NoDummyInputSize should be equal to the number of places.");
PADDLE_ENFORCE_EQ(in_var_handles.size(), num_places,
platform::errors::InvalidArgument(
"The NoDummyInputSize should be equal "
"to the number of places, but got NoDummyInputSize is "
"%d and the number of place is %d.",
in_var_handles.size(), num_places));
PADDLE_ENFORCE_EQ(
in_var_handles.size(), out_var_handles.size(),
"The NoDummyInputSize and NoDummyOutputSize should be equal.");
PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places);
platform::errors::InvalidArgument(
"The NoDummyInputSize and NoDummyOutputSize should be "
"equal, but got NoDummyInputSize is %d and NoDummyOutputSize is %d.",
in_var_handles.size(), out_var_handles.size()));
PADDLE_ENFORCE_EQ(
local_exec_scopes_.size(), num_places,
platform::errors::InvalidArgument(
"The number of local scopes should be equal "
"to the number of places, but got the number of local scopes is "
"%d and the number of place is %d.",
in_var_handles.size(), num_places));
std::vector<const void *> lod_tensor_data;
std::vector<platform::Place> places;
......@@ -78,23 +102,36 @@ void AllReduceOpHandle::AllReduceImpl(
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
auto &local_scope = local_exec_scopes_[i];
auto var = local_scope->FindVar(in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, "%s is not found int scope.",
in_var_handles[i]->name());
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::NotFound(
"Variable %s is not found in local scope.",
in_var_handles[i]->name()));
auto &lod_tensor = var->Get<LoDTensor>();
if (i == 0) {
numel = static_cast<int64_t>(lod_tensor.numel());
// only enforce place 0; we will enforce other places' numel == place 0's numel
PADDLE_ENFORCE_GT(
numel, 0, platform::errors::InvalidArgument(
"The numel of tensos=[%s] must > 0. But now numel=[%d]",
in_var_handles[i]->name(), numel));
numel, 0,
platform::errors::PreconditionNotMet(
"The numel of tensor %s should be > 0, but got numel is %d.",
in_var_handles[i]->name(), numel));
dtype = lod_tensor.type();
is_gpu_place = platform::is_gpu_place(lod_tensor.place());
}
PADDLE_ENFORCE_EQ(numel, static_cast<int64_t>(lod_tensor.numel()));
PADDLE_ENFORCE_EQ(dtype, lod_tensor.type());
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()));
PADDLE_ENFORCE_EQ(
numel, static_cast<int64_t>(lod_tensor.numel()),
platform::errors::PreconditionNotMet(
"The size of tensors of the same variable in different local "
"scopes should be equal."));
PADDLE_ENFORCE_EQ(
dtype, lod_tensor.type(),
platform::errors::PreconditionNotMet(
"The dtype of tensors of the same variable in different local "
"scopes should be equal."));
PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
platform::errors::PreconditionNotMet(
"The place type of tensors of the same variable "
"in different local scopes should be equal."));
lod_tensor_data.emplace_back(lod_tensor.data<void>());
places.emplace_back(lod_tensor.place());
......@@ -102,8 +139,12 @@ void AllReduceOpHandle::AllReduceImpl(
VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name()
<< ", out_name:" << out_var_handles[i]->name();
PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(),
"The name of input and output should be equal.");
PADDLE_ENFORCE_EQ(
in_var_handles[i]->name(), out_var_handles[i]->name(),
platform::errors::InvalidArgument(
"The name of input and output of all_reduce op should be equal, "
"but got input is %s and output is %s.",
in_var_handles[i]->name(), out_var_handles[i]->name()));
}
std::vector<std::string> grad_var_names;
......@@ -122,7 +163,9 @@ void AllReduceOpHandle::AllReduceFunc(
const std::vector<std::string> &out_var_names) {
if (is_gpu_place(places[0])) {
#if defined(PADDLE_WITH_NCCL)
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_,
platform::errors::InvalidArgument(
"The nccl context should not be NULL."));
ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype);
std::vector<std::function<void()>> all_reduce_calls;
for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
......@@ -134,7 +177,8 @@ void AllReduceOpHandle::AllReduceFunc(
}
NCCLAllReduceFunc(all_reduce_calls);
#else
PADDLE_THROW("Not compiled with CUDA.");
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with CUDA."));
#endif
} else { // Special handle CPU only Operator's gradient. Like CRF
auto &trg = *local_exec_scopes_[0]
......
......@@ -89,8 +89,19 @@ AsyncSSAGraphExecutor::AsyncSSAGraphExecutor(
places_(std::move(places)),
graphs_(std::move(graphs)) {
VLOG(3) << "build AsyncSSAGraphExecutor";
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
PADDLE_ENFORCE_EQ(
local_scopes_.size(), local_exec_scopes_.size(),
platform::errors::InvalidArgument(
"The number of local scopes and the number of local execution scopes "
"should be equal, but got number of local scopes is %d and "
"number of local execution scopes is %d.",
local_scopes_.size(), local_exec_scopes_.size()));
// set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
......
......@@ -19,6 +19,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "boost/optional.hpp"
#include "paddle/fluid/framework/ir/pass_builder.h"
#include "paddle/fluid/framework/program_desc.h"
......@@ -119,6 +120,9 @@ struct BuildStrategy {
// Turn on inplace by default.
bool enable_inplace_{true};
// Turn off inplace addto by default.
bool enable_addto_{false};
// FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode,
// num_trainers is 1, so the current fields of build_strategy don't tell if
// it's a distributed model.
......
......@@ -12,12 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <deque>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -48,7 +50,9 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
bootstrap_ops_.emplace_back(op);
}
}
PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators.");
PADDLE_ENFORCE_GT(op_deps_.size(), 0,
platform::errors::PreconditionNotMet(
"The graph doesn't have operators."));
PrepareAtomicOpDeps();
}
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......@@ -138,8 +140,10 @@ void FetchOpHandle::RunImpl() {
auto *var_handle = static_cast<VarHandle *>(inputs_[i]);
auto &scope = scopes.at(var_handle->scope_idx());
auto *var = scope->FindVar(var_handle->name());
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
var_handle->name());
PADDLE_ENFORCE_NOT_NULL(
var,
platform::errors::NotFound(
"Cannot find variable %s in execution scope.", var_handle->name()));
if (var->IsType<LoDTensor>()) {
auto &t = var->Get<framework::LoDTensor>();
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/op_handle_base.h"
#include <map>
#include <unordered_set>
......@@ -88,6 +89,12 @@ void OpHandleBase::Run(bool use_cuda) {
PADDLE_ENFORCE(!use_cuda);
#endif
// skip running current op, used with inplace_addto_op_pass
if (skip_running_) {
VLOG(4) << "skip running: " << Name();
return;
}
RunImpl();
}
......
......@@ -18,6 +18,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/device_context.h"
......@@ -52,6 +53,10 @@ class OpHandleBase {
virtual Priority GetPriority() const { return kNormal; }
virtual bool GetSkipRunning() const { return skip_running_; }
virtual void SetSkipRunning(bool skip_running) { skip_running_ = skip_running; }
virtual std::string Name() const = 0;
void Run(bool use_cuda);
......@@ -131,6 +136,7 @@ class OpHandleBase {
std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
std::vector<Scope *> local_exec_scopes_;
bool skip_running_ = false;
#ifdef PADDLE_WITH_CUDA
std::unordered_map<int, cudaEvent_t> events_;
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
......@@ -104,7 +106,12 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
places_(places),
graphs_(std::move(graphs)),
feed_status_(places.size(), FeedStatus::kNone) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
platform::errors::InvalidArgument(
"The number of places and the number of local scopes "
"should be equal, but got number of places is %d and "
"number of local scopes is %d.",
places_.size(), local_scopes_.size()));
PADDLE_ENFORCE_EQ(places_.size(), graphs_.size(),
platform::errors::InvalidArgument(
......
......@@ -13,10 +13,12 @@
// limitations under the License.
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
......@@ -37,7 +39,13 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor(
var_infos_(std::move(var_infos)),
places_(std::move(places)),
scope_monitor_(places_, local_exec_scopes_) {
PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size());
PADDLE_ENFORCE_EQ(
local_scopes_.size(), local_exec_scopes_.size(),
platform::errors::InvalidArgument(
"The number of local scopes and the number of local execution scopes "
"should be equal, but got number of local scopes is %d and "
"number of local execution scopes is %d.",
local_scopes_.size(), local_exec_scopes_.size()));
PrepareLocalExeScopes();
}
......
......@@ -13,9 +13,11 @@
// limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -29,7 +31,8 @@ static inline const Tensor &GetTensorFromVar(const Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor."));
}
}
......@@ -37,20 +40,27 @@ static inline Tensor *GetMutableTensorFromVar(Variable *var) {
if (var->IsType<LoDTensor>()) {
return var->GetMutable<LoDTensor>();
} else {
PADDLE_THROW("Variable must be type of LoDTensor");
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable must be type of LoDTensor."));
}
}
ShareTensorBufferFunctor::ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names)
const std::vector<std::string> &out_var_names, bool share_dims)
: scope_(scope),
scope_idx_(scope_idx),
op_type_(op_type),
in_var_infos_(in_var_infos),
out_var_names_(out_var_names) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
out_var_names_(out_var_names),
share_dims_(share_dims) {
PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size(),
platform::errors::PreconditionNotMet(
"The number of input variables and output variables "
"should be equal, but got number of input variables is "
"%d and number of output variables is %d.",
in_var_infos_.size(), out_var_names_.size()));
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
AddReuseVarPair(in_var_infos_[i], out_var_names_[i]);
}
......@@ -67,32 +77,59 @@ ShareTensorBufferFunctor::ReusedVars() const {
void ShareTensorBufferFunctor::AddReuseVarPair(
const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) {
PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
PADDLE_ENFORCE_NOT_NULL(
in_var_info,
platform::errors::InvalidArgument(
"The input variables to be inplaced should not be NULL."));
PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
"in/out cannot have same name: %s", out_var_name);
platform::errors::InvalidArgument(
"The input variable and output variable to be inplaced "
"cannot have the same name: %s.",
out_var_name));
in_var_infos_.emplace_back(in_var_info);
out_var_names_.emplace_back(out_var_name);
}
void ShareTensorBufferFunctor::CallOnce() {
PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here");
PADDLE_ENFORCE(in_out_vars_.empty(),
platform::errors::InvalidArgument(
"The input-output variable pairs to be "
"inplaced should be initialized here."));
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name());
auto *out_var = exec_scope_->FindVar(out_var_names_[i]);
PADDLE_ENFORCE_NOT_NULL(in_var);
PADDLE_ENFORCE_NOT_NULL(out_var);
PADDLE_ENFORCE_NE(in_var, out_var);
PADDLE_ENFORCE_NOT_NULL(
in_var, platform::errors::NotFound(
"The input variable(%s)to be inplaced should not be NULL.",
in_var_infos_[i]->Name()));
PADDLE_ENFORCE_NOT_NULL(
out_var,
platform::errors::NotFound(
"The output variable(%s) to be inplaced should not be NULL.",
out_var_names_[i]));
PADDLE_ENFORCE_NE(
in_var, out_var,
platform::errors::PreconditionNotMet(
"The input variable and output variable to be inplaced "
"cannot be the same variable(%s).",
out_var_names_[i]));
in_out_vars_.emplace_back(in_var, out_var);
}
}
void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
if (!exec_scope_) {
PADDLE_ENFORCE_NOT_NULL(exec_scope);
PADDLE_ENFORCE_NOT_NULL(exec_scope,
platform::errors::InvalidArgument(
"The given execution scope should not be NULL "
"if the cached scope is NULL."));
exec_scope_ = exec_scope;
CallOnce();
} else {
PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same");
PADDLE_ENFORCE_EQ(exec_scope_, exec_scope,
platform::errors::InvalidArgument(
"The given execution scope and the cached execution "
"scope should be the same."));
}
for (size_t i = 0; i < in_var_infos_.size(); ++i) {
......@@ -115,6 +152,13 @@ void ShareTensorBufferFunctor::operator()(Scope *exec_scope) {
} else {
out_tensor->ShareBufferWith(in_tensor);
// NOTE(zhiqiu): In the case of inplace addto, if the operator of
// the in_out_vars is skipped during running, we should set the dims of
// output as the same as input.
if (share_dims_) {
out_tensor->Resize(in_tensor.dims());
}
VLOG(2) << "Share tensor buffer when running " << op_type_ << " : "
<< in_var_info->Name() << " -> " << out_var_names_[i];
}
......
......@@ -19,6 +19,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/scope.h"
......@@ -40,11 +41,13 @@ class ShareTensorBufferFunctor {
ShareTensorBufferFunctor(
Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names);
const std::vector<std::string> &out_var_names, bool share_dims = false);
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name);
void SetShareDims(bool share_dims) { share_dims_ = share_dims; }
void operator()(Scope *exec_scope);
std::unordered_map<std::string, std::string> ReusedVars() const;
......@@ -66,6 +69,11 @@ class ShareTensorBufferFunctor {
std::vector<std::string> out_var_names_;
std::vector<std::pair<const Variable *, Variable *>> in_out_vars_;
// NOTE(zhiqiu): In the case of inplace addto, if the operator of
// the in_out_vars is skipped during running, we should set the dims of output
// as the same as input.
bool share_dims_{false};
};
} // namespace details
......
......@@ -13,8 +13,10 @@
// limitations under the License.
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
......@@ -32,26 +34,35 @@ ComputationOpHandle *GetUniquePendingComputationOpHandle(
for (ir::Node *pending_op : out_var->outputs) {
auto &op = pending_op->Wrapper<OpHandleBase>();
auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
PADDLE_ENFORCE_NOT_NULL(compute_op);
PADDLE_ENFORCE_NOT_NULL(
compute_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should be ComputationOpHandle."));
if (result_op == nullptr) {
result_op = compute_op;
} else {
PADDLE_ENFORCE_EQ(result_op, compute_op);
PADDLE_ENFORCE_EQ(
result_op, compute_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should be the unique one."));
}
}
}
PADDLE_ENFORCE_NOT_NULL(result_op);
PADDLE_ENFORCE_NOT_NULL(result_op,
platform::errors::PreconditionNotMet(
"The pending OpHandle should not be NULL."));
return result_op;
}
ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
const std::vector<std::string> &out_var_names)
const std::vector<std::string> &out_var_names, bool share_dims)
: OpHandleBase(node),
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
functor_(scope, scope_idx, op_type, in_var_infos, out_var_names,
share_dims) {}
std::unordered_map<std::string, std::string>
ShareTensorBufferOpHandle::ReusedVars() const {
......@@ -63,6 +74,10 @@ void ShareTensorBufferOpHandle::AddReuseVarPair(
functor_.AddReuseVarPair(in_var_info, out_var_name);
}
void ShareTensorBufferOpHandle::SetShareDims(bool share_dims) {
functor_.SetShareDims(share_dims);
}
void ShareTensorBufferOpHandle::InitCUDA() {
#ifdef PADDLE_WITH_CUDA
int dev_id =
......
......@@ -17,6 +17,7 @@
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
......@@ -31,7 +32,7 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
ir::Node *node, Scope *scope, size_t scope_idx,
const std::string &op_type,
const std::vector<const ir::MemOptVarInfo *> &in_vars_infos,
const std::vector<std::string> &out_var_names);
const std::vector<std::string> &out_var_names, bool share_dims = false);
std::unordered_map<std::string, std::string> ReusedVars() const;
......@@ -42,6 +43,8 @@ class ShareTensorBufferOpHandle : public OpHandleBase {
void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
const std::string &out_var_name);
void SetShareDims(bool share_dims);
const ShareTensorBufferFunctor &Functor() const { return functor_; }
protected:
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fetch_async_op_handle.h"
namespace paddle {
......@@ -27,8 +28,9 @@ void ClearFetchOp(ir::Graph* graph, std::vector<OpHandleBase*>* fetch_ops) {
PADDLE_ENFORCE_EQ(dynamic_cast<FetchOpHandle*>(op) != nullptr ||
dynamic_cast<FetchAsyncOpHandle*>(op) != nullptr,
true,
"The input ops of ClearFetchOp function should be "
"FetchOpHandle or FetchAsyncOpHandle.");
platform::errors::PreconditionNotMet(
"The input ops of ClearFetchOp function should be "
"FetchOpHandle or FetchAsyncOpHandle."));
for (auto& out_var : op->Node()->outputs) {
graph->RemoveNode(out_var);
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -138,7 +139,10 @@ inline FetchResultType ThreadedSSAGraphExecutor::RunImpl(
}
}
}
PADDLE_ENFORCE(ready_ops.empty());
PADDLE_ENFORCE_EQ(
ready_ops.empty(), true,
platform::errors::Fatal("After the execution of computation graph, "
"there are unexecuted operators left."));
}
// Wait FetchOps.
......@@ -165,9 +169,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
FetchResultType *fetch_data, bool return_merged) {
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
std::unordered_set<VarHandleBase *> local_ready_vars;
std::unordered_set<std::string> fetch_tensor_set(fetch_tensors.begin(),
fetch_tensors.end());
for (auto &fetch_var_name : fetch_tensor_set) {
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->Get<details::GraphVars>(details::kGraphVars)) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
......@@ -231,7 +234,11 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
ready_ops->insert(static_cast<OpHandleBase *>(op));
}
}
PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0);
PADDLE_ENFORCE_EQ(
local_ready_vars.size(), 0,
platform::errors::Fatal(
"The number of ready variables should be 0, but got %d.",
local_ready_vars.size()));
}
void ThreadedSSAGraphExecutor::InsertPendingOp(
......@@ -277,7 +284,9 @@ void ThreadedSSAGraphExecutor::PrepareOpDeps() {
}
}
op_deps_->num_ops_ = ready_ops.size() + pending_ops.size();
PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators.");
PADDLE_ENFORCE_GT(
op_deps_->num_ops_, 0,
platform::errors::InvalidArgument("The graph doesn't have operators."));
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
......
......@@ -14,6 +14,8 @@
#pragma once
#include <ThreadPool.h> // ThreadPool in third party
#include <deque>
#include <functional>
#include <list>
......@@ -24,8 +26,6 @@
#include <utility>
#include <vector>
#include <ThreadPool.h> // ThreadPool in thrird party
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
......
......@@ -54,8 +54,10 @@ struct VarHandleBase {
void AddOutput(OpHandleBase* out, ir::Node* node) {
if (pending_ops_.find(out) == pending_ops_.end()) {
PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr",
this->Node()->Name());
PADDLE_ENFORCE_NOT_NULL(out,
platform::errors::InvalidArgument(
"The output added to VarHandle %s is NULL.",
this->Node()->Name()));
pending_ops_.insert(out);
node_->outputs.push_back(node);
}
......@@ -120,7 +122,10 @@ struct VarHandle : public VarHandleBase {
bool HasEvent() { return has_event_; }
const cudaEvent_t& GetEvent() {
PADDLE_ENFORCE(HasEvent(), "The event is not set.");
PADDLE_ENFORCE_EQ(
HasEvent(), true,
platform::errors::PreconditionNotMet(
"The cuda event is not set, maybe InitCUDA() is not called."));
return event_;
}
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/details/variable_visitor.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
......@@ -24,7 +25,9 @@ static void VisitVariable(Variable* var, Func* func) {
} else if (var->IsType<SelectedRows>()) {
(*func)(var->GetMutable<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", ToTypeName(var->Type()));
PADDLE_THROW(platform::errors::Unimplemented(
"VisitVariable is not supported for type %s.",
ToTypeName(var->Type())));
}
}
......@@ -35,7 +38,8 @@ static void VisitVariable(const Variable& var, Func* func) {
} else if (var.IsType<SelectedRows>()) {
(*func)(var.Get<SelectedRows>());
} else {
PADDLE_THROW("Not supported type %s", ToTypeName(var.Type()));
PADDLE_THROW(platform::errors::Unimplemented(
"VisitVariable is not supported for type %s.", ToTypeName(var.Type())));
}
}
......@@ -50,7 +54,8 @@ struct TensorVisitor {
template <typename T>
void operator()() {
PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name());
PADDLE_THROW(platform::errors::Unimplemented(
"Getting tensor from type %s is not supported.", typeid(T).name()));
}
};
......@@ -78,8 +83,8 @@ struct ShareDimsAndLoDVisitor {
template <typename T>
void operator()(const T&) {
PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s",
typeid(T).name());
PADDLE_THROW(platform::errors::Unimplemented(
"ShareDimsAndLoD is not supported for type %s.", typeid(T).name()));
}
};
......@@ -89,42 +94,54 @@ void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) {
}
struct EnforceShapeAndDTypeEQVisitor {
const Variable* trg_;
const Variable* dst_;
void operator()(const LoDTensor& src) {
auto& tensor = trg_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(
src.place().which(), tensor.place().which(),
"The Places of the two Variable must be all on CPU or all on GPU.");
auto& tensor = dst_->Get<LoDTensor>();
PADDLE_ENFORCE_EQ(src.place().which(), tensor.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(src.type(), tensor.type(),
"The dtype of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(),
"The dims of the two Variable is not equal.");
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.dims(), tensor.dims(),
platform::errors::PreconditionNotMet(
"The layout of the two variables' tensors is not equal."));
PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(),
"The lod of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(),
"The layout of the two Variable's tensor is not equal.");
platform::errors::PreconditionNotMet(
"The lod of the two variable is not equal."));
PADDLE_ENFORCE_EQ(
src.layout(), tensor.layout(),
platform::errors::PreconditionNotMet(
"The layout of the two variables' tensors tensor is not equal."));
}
void operator()(const SelectedRows& src) {
auto& selected_rows = trg_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(
src.place().which(), selected_rows.place().which(),
"The Places of the two Variable must be all on CPU or all on GPU.");
auto& selected_rows = dst_->Get<SelectedRows>();
PADDLE_ENFORCE_EQ(src.place().which(), selected_rows.place().which(),
platform::errors::PreconditionNotMet(
"The place type of the two variables is not equal."));
PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(),
"The dtype of the two Variable is not equal.");
PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(),
"The layout of the two Variable's tensor is not equal.");
platform::errors::PreconditionNotMet(
"The dtype of the two variables is not equal."));
PADDLE_ENFORCE_EQ(
src.value().layout(), selected_rows.value().layout(),
platform::errors::PreconditionNotMet(
"The layout of the two variables' tensors is not equal."));
PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(),
"The height of the two Variable is not equal.");
platform::errors::PreconditionNotMet(
"The height of the two variables is not equal."));
PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(),
"The dims of the two Variable is not equal.");
platform::errors::PreconditionNotMet(
"The dims of the two variables is not equal."));
}
template <typename T>
void operator()(const T&) {
PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s",
typeid(T).name());
PADDLE_THROW(platform::errors::Unimplemented(
"EnforceShapeAndDTypeEQ is not supported for type %s.",
typeid(T).name()));
}
};
......
......@@ -23,6 +23,8 @@
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace ir {
......@@ -34,7 +36,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
// Build pattern
PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
->assert_is_op_input("lookup_table")
->assert_is_op_input("lookup_table_v2")
->assert_var_not_persistable();
patterns::Embedding embedding_pattern(pattern, name_scope);
// TODO(jczaja): Intermediate can only be for val that are not used anywhere
......@@ -256,3 +258,11 @@ void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(embedding_fc_lstm_fuse_pass,
paddle::framework::ir::EmbeddingFCLSTMFusePass);
REGISTER_PASS_CAPABILITY(embedding_fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("lookup_table_v2", 0)
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fused_embedding_fc_lstm", 0));
......@@ -18,6 +18,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -182,3 +183,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass)
.RequirePassAttr("use_gpu");
REGISTER_PASS_CAPABILITY(fc_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0)
.EQ("fc", 0));
......@@ -16,6 +16,7 @@
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -125,7 +126,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
auto* x_n = subgraph.at(x);
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern);
......@@ -136,10 +136,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
gru_pattern);
GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchHidden, gru_pattern);
// TODO(wilber): Support origin_mode=True.
if (gru->Op()->GetAttrIfExists<bool>("origin_mode") == true) {
LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True.";
return;
}
if (with_fc_bias) {
GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern);
gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias);
// Remove unneeded nodes.
......@@ -188,3 +195,16 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass);
REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass);
REGISTER_PASS_CAPABILITY(mul_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
REGISTER_PASS_CAPABILITY(fc_gru_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("gru", 0)
.EQ("fusion_gru", 0));
......@@ -16,6 +16,7 @@
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -196,3 +197,17 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass);
REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass);
REGISTER_PASS_CAPABILITY(fc_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("elementwise_add", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
REGISTER_PASS_CAPABILITY(mul_lstm_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("mul", 0)
.EQ("lstm", 0)
.EQ("fusion_lstm", 0));
......@@ -13,4 +13,6 @@ cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handl
cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass)
cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass)
cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op)
......@@ -16,6 +16,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
......@@ -141,11 +142,12 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const {
VLOG(4) << "Inplace performed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString();
<< ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
} else {
VLOG(3) << "Inplace failed in op " << op_type << ": "
<< in_var_handle_ptr->Name() << " -> "
<< out_var_handle_ptr->Name();
<< out_var_handle_ptr->Name() << ". ReuseType: " << ReuseType();
}
}
}
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
class InplaceAddToOpPass : public MemoryReusePass {
protected:
std::string ReuseType() const override { return "inplace_addto"; }
void Run(Graph *graph) const override;
private:
// 1. Add the last living op of out_var as a last living op of in_var
// 2. Set the reference count of in_var to 2
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const override {
size_t scope_idx = op->GetScopeIdx();
auto *last_live_ops_of_vars_ =
&Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
auto *var_infos_ = &(Get<MemOptVarInfoMapList>(kMemOptVarInfoMapList));
auto out_var_op_iter =
(*last_live_ops_of_vars_)[scope_idx].find(out_var->Name());
// In Reduce mode, some output variables (gradients of parameters) do not
// have last live ops
details::ComputationOpHandle *last_live_op_of_in_var = nullptr;
if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) {
last_live_op_of_in_var = op;
} else {
PADDLE_ENFORCE_EQ(
out_var_op_iter->second.ops().empty(), false,
platform::errors::InvalidArgument(
"Var(%s)'s last live op should not empty.", out_var->Name()));
last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin());
}
auto *last_live_ops_of_in_var =
(*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops();
// last_live_ops_of_in_var->clear();
last_live_ops_of_in_var->insert(last_live_op_of_in_var);
auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name());
PADDLE_ENFORCE_NE(
in_var_info_iter, (*var_infos_)[scope_idx].end(),
platform::errors::NotFound("Cannot find variable %s.", in_var->Name()));
in_var_info_iter->second->SetRefCnt(2); // before inplace, it is 1
}
};
void InplaceAddToOpPass::Run(Graph *graph) const {
const auto &last_live_ops =
Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
bool use_cuda = Get<bool>(kUseCuda);
// Currently, only perform InplaceAddToOpPass on cuda place
if (!use_cuda) {
return;
}
// Step 1: Build a reverse map of last_live_ops
// i.e.: op -> vars
std::unordered_map<details::ComputationOpHandle *,
std::unordered_map<std::string, ir::Node *>>
candidate_ops;
for (auto &each_scope_ops : last_live_ops) {
for (auto &pair : each_scope_ops) {
// If variable has more than 1 last lived ops, this variable cannot
// be inplaced.
if (pair.second.ops().size() != 1) {
continue;
}
auto *op = *(pair.second.ops().begin());
const std::string &op_type = op->GetOp()->Type();
const framework::OpDesc *op_desc = op->Node()->Op();
PADDLE_ENFORCE_NOT_NULL(
op_desc, platform::errors::NotFound("Cannot find the OpDesc of Op(%s).",
op->Name()));
// only grad op should be processed.
if (op_type != "grad_add") {
continue;
}
const std::string &var_name = pair.first;
auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs);
if (in_nodes.size() == 1) {
candidate_ops[op][var_name] = *in_nodes.begin();
}
VLOG(4) << "Find op " << op_type << " with input(" << var_name
<< ") that can do inplace add to";
}
}
// Step 2: Check which vars can be inplaced indeed
for (auto &op_vars_pair : candidate_ops) {
auto *op = op_vars_pair.first;
// The original gradient accumulation is g = sum(g_0, g_1,..., g_n), and it
// could be changed as follows if inplace addto is enabled:
// g_sum_0 = g_0
// g_sum_1 = grad_add(g_sum_0, g_1)
// g_sum_2 = grad_add(g_sum_1, g_2)
// ...
// g_sum_n = grad_add(g_sum_n-1, g_n)
// here we will add inplace for each grad_add; for example, for the first
// grad_add, inplace g_sum_0 -> g_1 and g_1 -> g_sum_1, and set grad_add as skipped.
const std::string &op_type = op->GetOp()->Type();
PADDLE_ENFORCE_EQ(op->Node()->inputs.size(), 2,
platform::errors::InvalidArgument(
"The size of inputs of %s should be 2, but got %d",
op_type, op->Node()->inputs.size()));
PADDLE_ENFORCE_EQ(op->Node()->outputs.size(), 1,
platform::errors::InvalidArgument(
"The size of outputs of %s should be 1, but got %d",
op_type, op->Node()->outputs.size()));
auto *left_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[0]->Wrapper<details::VarHandleBase>()));
auto *right_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->inputs[1]->Wrapper<details::VarHandleBase>()));
auto *out_var_ptr = dynamic_cast<details::VarHandle *>(
&(op->Node()->outputs[0]->Wrapper<details::VarHandleBase>()));
if (left_var_ptr == nullptr || right_var_ptr == nullptr ||
out_var_ptr == nullptr) {
continue;
}
// auto *left_generated_op = dynamic_cast<details::ComputationOpHandle *>(
// left_var_ptr->GeneratedOp());
auto *right_generated_op = dynamic_cast<details::ComputationOpHandle *>(
right_var_ptr->GeneratedOp());
auto *out_generated_op = dynamic_cast<details::ComputationOpHandle *>(
out_var_ptr->GeneratedOp());
// NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy
if (right_generated_op->Name() != "conv2d_grad") {
continue;
}
// NOTE(zhiqiu): Normally, if we inplace a->b, we should let a be generated
// before b. However, in the situation of inplace addto, we do not care
// the order, since a+b is equal to b+a. Is there any exception for that?
// AddDependencyVar(right_generated_op, left_generated_op);
// no need, as discussed above.
// step (a): inplace right_var->left_var of grad_add
this->AddReuseVar(right_generated_op, left_var_ptr, right_var_ptr);
UpdateLastLiveOpOfVar(right_generated_op, left_var_ptr, right_var_ptr);
VLOG(4) << "Inplace performed in op " << right_generated_op->GetOp()->Type()
<< ": " << left_var_ptr->Name() << " -> " << right_var_ptr->Name()
<< ". Debug String is: "
<< right_generated_op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (b): inplace out -> right_var of grad_add
this->AddReuseVar(out_generated_op, right_var_ptr, out_var_ptr, true);
VLOG(4) << "Inplace performed in op " << op_type << ": "
<< right_var_ptr->Name() << " -> " << out_var_ptr->Name()
<< ". Debug String is: " << op->GetOp()->DebugString()
<< ". ReuseType: " << ReuseType();
// step (c): make sure right_var cannot be inplaced afterwards. This can be
// done automatically since CollectReusedVars is called before any reuse.
// step (d): make right_var's generated op use addto
right_generated_op->GetOp()->SetAttr("use_addto", true);
// step (e): make grad_add skip running
op->SetSkipRunning(true);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(inplace_addto_op_pass, paddle::framework::ir::InplaceAddToOpPass)
.RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList)
.RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars)
.RequirePassAttr(paddle::framework::ir::kUseCuda);
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h"
#include <functional>
#include <map>
#include <string>
......@@ -73,6 +74,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var,
out_var->Name()));
if (IsVarPairReusable(*in_var, *out_var)) {
AddReuseVar(op, in_var, out_var);
UpdateLastLiveOpOfVar(op, in_var, out_var);
return true;
} else {
return false;
......@@ -324,7 +326,8 @@ bool MemoryReusePass::IsVarPairReusable(
void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const {
details::VarHandle *out_var,
bool share_dims) const {
PADDLE_ENFORCE_GT(
(*var_infos_)[op->GetScopeIdx()].count(in_var->Name()), 0,
platform::errors::NotFound("Var(%s) does not in mem opt var infos.",
......@@ -344,13 +347,15 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op,
share_buffer_op->AddInput(in_var);
}
if (share_dims) {
share_buffer_op->SetShareDims(true);
}
share_buffer_op->AddReuseVarPair(
(*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(),
out_var->Name());
reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name());
reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name());
UpdateLastLiveOpOfVar(op, in_var, out_var);
}
// 1. Set last living op of in_var to be any last living op of out_var
......
......@@ -18,6 +18,7 @@
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h"
......@@ -92,6 +93,12 @@ class MemoryReusePass : public Pass {
int64_t GetMemorySize(const details::VarHandle &var) const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var, bool share_dims = false) const;
virtual void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private:
VarDesc *GetVarDesc(const details::VarHandle &var) const;
......@@ -109,13 +116,6 @@ class MemoryReusePass : public Pass {
void CollectReusedVars() const;
void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var,
details::VarHandle *out_var) const;
void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op,
details::VarHandle *in_var,
details::VarHandle *out_var) const;
private:
mutable Graph *graph_;
mutable bool use_cuda_;
......
......@@ -176,7 +176,8 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern,
return false;
}
if (x->IsVar() && x->Var() && x->Var()->GetShape().size() > 2) {
LOG(WARNING) << "repeated fc relu only supports input dims = 2";
VLOG(3) << "repeated fc relu only supports input dims = 2, so it "
"is not applied.";
return false;
}
int fc_idx = FindFCIdx(x);
......
......@@ -16,6 +16,7 @@
#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -98,3 +99,9 @@ void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(seqconv_eltadd_relu_fuse_pass,
paddle::framework::ir::SeqConvEltAddReluFusePass);
REGISTER_PASS_CAPABILITY(seqconv_eltadd_relu_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("sequence_conv", 0)
.EQ("elementwise_add", 0)
.EQ("relu", 0));
......@@ -35,8 +35,6 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
const std::string pattern_name = "shufflechannel_pattern";
FusePassBase::Init(pattern_name, graph);
LOG(WARNING) << "There is fluid.layers.shuffle_channel API already, you can "
"use it instead of (reshape + transpose +reshape)";
GraphPatternDetector gpd;
auto* x = gpd.mutable_pattern()
->NewNode("x")
......@@ -85,6 +83,9 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const {
// Delete the unneeded nodes.
GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op,
transpose_out, reshape2_op});
LOG_FIRST_N(WARNING, 1)
<< "There is fluid.layers.shuffle_channel API already, maybe you can "
"use it instead of (reshape + transpose + reshape)";
};
gpd(graph, handler);
......
......@@ -17,6 +17,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
......@@ -77,7 +78,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
};
auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) {
bool basic = var_is_op_input(x, "matmul", arg_name) &&
bool basic = (var_is_op_input(x, "matmul_v2", arg_name) ||
var_is_op_input(x, "matmul", arg_name)) &&
var_is_op_input(x, "square", "X");
if (!basic) {
return false;
......@@ -88,7 +90,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
}
auto* squared_x = squared_x_op->outputs[0];
bool next_is_matmul_from_arg =
var_is_op_input(squared_x, "matmul", arg_name) &&
(var_is_op_input(squared_x, "matmul_v2", arg_name) ||
var_is_op_input(squared_x, "matmul", arg_name)) &&
squared_x->outputs.size() == 1 &&
squared_x->outputs[0]->outputs.size() == 1;
if (!next_is_matmul_from_arg) {
......@@ -103,7 +106,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_first_mul_out = [=](Node* x) -> bool {
bool input_is_matmul_op = x && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() &&
x->inputs[0]->Op()->Type() == "matmul";
(x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!input_is_matmul_op) {
return false;
}
......@@ -167,7 +171,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_xy_op = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" &&
return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_first_mul_out(x->outputs[0]);
},
name_scope + "/matmul_xy_op");
......@@ -189,7 +194,9 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool {
bool basic = x && x->IsVar() && x->inputs.size() == 1 &&
x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul";
x->inputs[0]->IsOp() &&
(x->inputs[0]->Op()->Type() == "matmul_v2" ||
x->inputs[0]->Op()->Type() == "matmul");
if (!basic) {
return false;
}
......@@ -206,7 +213,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern,
auto* matmul_squared_x_y_op = pattern->NewNode(
[=](Node* x) {
return x && x->IsOp() && x->Op()->Type() == "matmul" &&
return x && x->IsOp() && (x->Op()->Type() == "matmul_v2" ||
x->Op()->Type() == "matmul") &&
is_fusion_mat_squared_x_y_op_out(x->outputs[0]);
},
name_scope + "/matmul_squared_x_y_op");
......@@ -378,3 +386,13 @@ void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const {
REGISTER_PASS(squared_mat_sub_fuse_pass,
paddle::framework::ir::SquaredMatSubFusePass);
REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination()
.EQ("matmul", 0)
.EQ("matmul_v2", 0)
.EQ("square", 0)
.EQ("elementwise_mul", 0)
.EQ("elementwise_sub", 0)
.EQ("fill_constant", 0)
.EQ("fusion_squared_mat_sub", 0));
......@@ -24,7 +24,7 @@ namespace framework {
namespace ir {
/**
* Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar
* Fuse ( (A * B).^2 - (A.^2 * B.^2) ) .* scalar
*/
class SquaredMatSubFusePass : public FusePassBase {
public:
......
......@@ -157,6 +157,14 @@ class OperatorBase {
platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
return BOOST_GET_CONST(T, attrs_.at(name));
}
void SetAttr(const std::string& name, const Attribute& v) {
PADDLE_ENFORCE_EQ(
HasAttr(name), true,
platform::errors::NotFound(
"The attribute %s is not found in operator %s", name, Type()));
attrs_[name] = v;
}
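// Illustrative usage (not part of the original header): a graph pass may
// toggle a boolean attribute in place, e.g. the inplace_addto_op_pass calls
// op->SetAttr("use_addto", true) on conv2d_grad.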
const AttributeMap& Attrs() const { return attrs_; }
const VariableNameMap& Inputs() const { return inputs_; }
......
......@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/parallel_executor.h"
#include <algorithm>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/details/async_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
......@@ -108,6 +110,11 @@ class ParallelExecutorPrivate {
* them.
*/
inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
if (mem_opt_var_infos_.size() == 0) {
VLOG(4) << "The mem_opt_var_infos_ is empty, maybe no memory "
"optimization strategy is enabled";
return;
}
auto iter = mem_opt_var_infos_[scope_idx].find(name);
if (iter != mem_opt_var_infos_[scope_idx].end()) {
iter->second->SetSkipMemoryReuse(true);
......@@ -308,6 +315,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
}
bool need_mem_opt = build_strategy_.enable_inplace_ ||
build_strategy_.enable_addto_ ||
build_strategy_.memory_optimize_.get() || is_gc_enabled;
if (!need_mem_opt) return graph;
......@@ -320,6 +328,16 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
graph = ref_cnt_pass->Apply(graph);
VLOG(10) << "ReferenceCountPass Applied";
if (build_strategy_.enable_addto_) {
auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_);
VLOG(10) << "Start to apply inplace_addto_op_pass";
graph = addto_pass->Apply(graph);
VLOG(10) << "inplace_addto_op_pass Applied";
}
if (build_strategy_.enable_inplace_) {
auto inplace_pass =
ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
......@@ -1068,3 +1086,4 @@ USE_PASS(reference_count_pass);
USE_PASS(eager_deletion_pass);
USE_PASS(buffer_shared_inplace_pass);
USE_PASS(buffer_shared_cross_op_memory_reuse_pass);
USE_PASS(inplace_addto_op_pass);
......@@ -44,10 +44,11 @@ add_subdirectory(api)
set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
zero_copy_tensor reset_tensor_array
analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
if(WIN32)
# TODO(xingzhaolong, jiweibo): remove this branch and use create_static_lib(paddle_fluid) on Windows GPU as well
if(WIN32 AND WITH_GPU)
cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_API})
else()
create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API})
create_static_lib(paddle_fluid ${fluid_modules} ${STATIC_INFERENCE_API})
endif()
if(NOT APPLE AND NOT WIN32)
......
......@@ -1048,6 +1048,7 @@ void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
const AnalysisConfig &config) {
LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
}
......
......@@ -373,6 +373,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>(
const NativeConfig &config) {
LOG(WARNING) << "Deprecated. Please use CreatePredictor instead.";
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
}
......
......@@ -51,8 +51,8 @@ if (WIN32)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
safe_set_static_flag()
if (WITH_STATIC_LIB)
safe_set_static_flag()
add_definitions(-DSTATIC_LIB)
endif()
endif()
......@@ -136,7 +136,7 @@ else()
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB})
set(DEPS ${DEPS} libcmt shlwapi.lib)
set(DEPS ${DEPS} shlwapi.lib)
endif(NOT WIN32)
if(WITH_GPU)
......
......@@ -6,8 +6,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode
DATA_DIR=$4 # dataset
TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include
TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib
inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir
MSVC_STATIC_CRT=$7
inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir
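# Illustrative invocation (assuming $1 is PADDLE_ROOT and $2 is TURN_ON_MKL,
# matching how they are used below; the new 7th argument selects the MSVC
# static CRT on Windows):
#   bash <this_script> <PADDLE_ROOT> <TURN_ON_MKL> <TEST_GPU_CPU> <DATA_DIR> \
#       <TENSORRT_INCLUDE_DIR> <TENSORRT_LIB_DIR> <MSVC_STATIC_CRT>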
cd `dirname $0`
current_dir=`pwd`
......@@ -66,43 +66,54 @@ mkdir -p build
cd build
rm -rf *
if [ $(echo `uname` | grep "Win") != "" ]; then
# -----simple_on_word2vec on windows-----
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=OFF
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
Release/simple_on_word2vec.exe \
--dirname=$DATA_DIR/word2vec/word2vec.inference.model \
--use_gpu=False
if [ $? -ne 0 ]; then
echo "simple_on_word2vec demo runs fail."
exit 1
fi
# -----vis_demo on windows-----
rm -rf *
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=OFF
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for vis_demo_name in $vis_demo_list; do
Release/vis_demo.exe \
--modeldir=$DATA_DIR/$vis_demo_name/model \
--data=$DATA_DIR/$vis_demo_name/data.txt \
--refer=$DATA_DIR/$vis_demo_name/result.txt \
--use_gpu=False
if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail."
exit 1
for WITH_STATIC_LIB in ON OFF; do
if [ $(echo `uname` | grep "Win") != "" ]; then
# TODO(xingzhaolong, jiweibo): remove this once the Windows GPU library is ready.
if [ $TEST_GPU_CPU == ON ] && [ $WITH_STATIC_LIB == ON ]; then
return 0
fi
done
else
for WITH_STATIC_LIB in ON OFF; do
# -----simple_on_word2vec on windows-----
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=simple_on_word2vec \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for use_gpu in $use_gpu_list; do
Release/simple_on_word2vec.exe \
--dirname=$DATA_DIR/word2vec/word2vec.inference.model \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "simple_on_word2vec demo runs fail."
exit 1
fi
done
# -----vis_demo on windows-----
rm -rf *
cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \
-DWITH_MKL=$TURN_ON_MKL \
-DDEMO_NAME=vis_demo \
-DWITH_GPU=$TEST_GPU_CPU \
-DWITH_STATIC_LIB=$WITH_STATIC_LIB \
-DMSVC_STATIC_CRT=$MSVC_STATIC_CRT
msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln
for use_gpu in $use_gpu_list; do
for vis_demo_name in $vis_demo_list; do
Release/vis_demo.exe \
--modeldir=$DATA_DIR/$vis_demo_name/model \
--data=$DATA_DIR/$vis_demo_name/data.txt \
--refer=$DATA_DIR/$vis_demo_name/result.txt \
--use_gpu=$use_gpu
if [ $? -ne 0 ]; then
echo "vis demo $vis_demo_name runs fail."
exit 1
fi
done
done
else
# -----simple_on_word2vec on linux/mac-----
rm -rf *
cmake .. -DPADDLE_LIB=${inference_install_dir} \
......@@ -123,7 +134,6 @@ else
fi
done
fi
# ---------vis_demo on linux/mac---------
rm -rf *
cmake .. -DPADDLE_LIB=${inference_install_dir} \
......@@ -145,7 +155,6 @@ else
fi
done
done
# --------tensorrt mobilenet on linux/mac------
if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then
rm -rf *
......@@ -167,6 +176,6 @@ else
exit 1
fi
fi
done
fi
fi
done
set +x
......@@ -21,7 +21,7 @@ if /i "%use_mkl%"=="N" (
)
:set_paddle_infernece_lib
SET /P paddle_infernece_lib="Please input the path of paddle inference library, such as D:\fluid_inference_install_dir =======>"
SET /P paddle_infernece_lib="Please input the path of paddle inference library, such as D:\paddle_inference_install_dir =======>"
set tmp_var=!paddle_infernece_lib!
call:remove_space
set paddle_infernece_lib=!tmp_var!
......
......@@ -17,11 +17,7 @@
#if defined(_WIN32)
#ifndef PD_INFER_DECL
#ifdef PADDLE_DLL_INFERENCE
#ifndef PADDLE_ON_INFERENCE
#define PD_INFER_DECL
#else
#define PD_INFER_DECL __declspec(dllexport)
#endif // PADDLE_ON_INFERENCE
#else
#define PD_INFER_DECL __declspec(dllimport)
#endif // PADDLE_DLL_INFERENCE
......
......@@ -156,7 +156,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// "seqpool_concat_fuse_pass", //
"seqpool_cvm_concat_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
"fc_lstm_fuse_pass", //
// TODO(wilber): fix correctness problem.
// "fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
......
......@@ -130,7 +130,10 @@ bool PD_PredictorZeroCopyRun(const PD_AnalysisConfig* config,
VLOG(3) << "The inputs' size is " << input_names.size();
PADDLE_ENFORCE_EQ(
input_names.size(), in_size,
"The number of input and the number of model's input must match. ");
paddle::platform::errors::InvalidArgument(
"The number of input and the number of model's input must match. The "
"number of input is %d, the number of model's input is %d.",
input_names.size(), in_size));
for (int i = 0; i < in_size; ++i) {
auto input_t = predictor->GetInputTensor(inputs[i].name);
std::vector<int> tensor_shape;
......
......@@ -47,7 +47,9 @@ void Init(const std::vector<std::string> argv) {
void ReadBinaryFile(const std::string& filename, std::string* contents) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
PADDLE_ENFORCE_EQ(
fin.is_open(), true,
platform::errors::Unavailable("Failed to open file %s.", filename));
fin.seekg(0, std::ios::end);
contents->clear();
contents->resize(fin.tellg());
......@@ -133,9 +135,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
PADDLE_ENFORCE_EQ(
framework::IsProgramVersionSupported(main_program->Version()), true,
platform::errors::Unavailable("Model version %ld is not supported.",
main_program->Version()));
// model_from_memory is false in separate parameters.
LoadPersistables(executor, scope, *main_program, dirname, "",
......@@ -151,9 +154,10 @@ std::unique_ptr<framework::ProgramDesc> Load(
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
PADDLE_ENFORCE_EQ(
framework::IsProgramVersionSupported(main_program->Version()), true,
platform::errors::Unavailable("Model version %ld is not supported.",
main_program->Version()));
LoadPersistables(executor, scope, *main_program, "", param_filename,
false /* model_from_memory */);
......@@ -165,9 +169,10 @@ std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
const std::string& prog_buffer, const std::string& param_buffer) {
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(prog_buffer));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
PADDLE_ENFORCE_EQ(
framework::IsProgramVersionSupported(main_program->Version()), true,
platform::errors::Unavailable("Model version %ld is not supported.",
main_program->Version()));
LoadPersistables(executor, scope, *main_program, "", param_buffer,
true /* model_filename */);
......
......@@ -25,8 +25,10 @@ PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
const char* plugin_type;
DeserializeValue(&serial_data, &serial_length, &plugin_type);
PADDLE_ENFORCE(Has(plugin_type),
"trt plugin type %s does not exists, check it.", plugin_type);
PADDLE_ENFORCE_EQ(
Has(plugin_type), true,
platform::errors::NotFound("TensorRT plugin type `%s` does not exists.",
plugin_type));
auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
owned_plugins_.emplace_back(plugin);
......
......@@ -103,7 +103,11 @@ struct Serializer<std::vector<T>,
DeserializeValue(buffer, buffer_size, &size);
value->resize(size);
size_t nbyte = value->size() * sizeof(T);
PADDLE_ENFORCE_GE(*buffer_size, nbyte);
PADDLE_ENFORCE_GE(*buffer_size, nbyte,
platform::errors::InvalidArgument(
"Insufficient data in buffer, expect contains %d "
"byte, but actually only contains %d byte.",
*buffer_size, nbyte));
std::memcpy(value->data(), *buffer, nbyte);
reinterpret_cast<char const*&>(*buffer) += nbyte;
*buffer_size -= nbyte;
......
......@@ -46,7 +46,9 @@ struct Registry {
template <typename ItemChild>
void Register(const std::string& name) {
PADDLE_ENFORCE_EQ(items_.count(name), 0);
PADDLE_ENFORCE_EQ(items_.count(name), 0,
platform::errors::AlreadyExists(
"Item `%s` has beed registered.", name));
items_[name] = new ItemChild;
}
......
......@@ -92,7 +92,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows
lod_tensor maxouting unpooling pooling lod_rank_table context_project
sequence_pooling executor device_memory_aligment generator)
sequence_pooling segment_pooling executor device_memory_aligment generator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
......
......@@ -69,12 +69,18 @@ class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("alpha", "The scale of Original Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& alpha) {
PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0.");
PADDLE_ENFORCE_GE(
alpha, 0.0f,
platform::errors::InvalidArgument(
"Attribute 'alpha' must be greater than or equal to 0.0."));
});
AddAttr<float>("beta", "The scale of Position Embedding.")
.SetDefault(1.0f)
.AddCustomChecker([](const float& beta) {
PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0.");
PADDLE_ENFORCE_GE(
beta, 0.0f,
platform::errors::InvalidArgument(
"Attribute 'beta' must be greater than or equal to 0.0."));
});
AddComment(R"DOC(
Add Position Encoding Operator.
......
......@@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/copy.h>
#include <thrust/execution_policy.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/op_registry.h"
......@@ -58,6 +60,16 @@ static __global__ void FillIndex(T* indices, T num_rows, T num_cols) {
}
}
template <typename T, typename IndType>
static __global__ void FillFlattenGrad(const T* dO, const IndType* indices,
int64_t size, T* dX) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
int stride = blockDim.x * gridDim.x;
for (int i = index; i < size; i += stride) {
dX[indices[i]] = dO[i];
}
}
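// Worked example (illustrative): with indices = {2, 0, 1} and dO = {a, b, c},
// the scatter above produces dX = {b, c, a}, i.e. dX[indices[i]] = dO[i].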
template <typename T, typename IndType>
static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX,
IndType num_rows, IndType num_cols) {
......@@ -193,6 +205,23 @@ void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
}
template <typename T>
void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
const Tensor* indices, int64_t size, Tensor* dX) {
auto cu_stream = ctx.stream();
const int64_t block_size =
std::min(size, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock()));
int64_t max_threads = ctx.GetMaxPhysicalThreadCount();
const int64_t max_blocks =
std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (size + block_size - 1) / block_size);
FillFlattenGrad<<<grid_size, block_size, 0, cu_stream>>>(
dO->data<T>(), indices->data<int64_t>(), size, dX->data<T>());
}
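// Illustrative sizing for ArgFlattenAssign above (assuming
// GetMaxThreadsPerBlock() returns 1024, which is not guaranteed on every
// device): for size = 1000000, block_size = 1024 and
// grid_size = min(max_blocks, 977).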
template <typename DeviceContext, typename T>
class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
......@@ -205,8 +234,25 @@ class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
auto in_dims = input->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
int64_t numel = input->numel();
int64_t groups = numel / in_dims[axis];
const T* in_data = input->data<T>();
auto size = input->numel();
T* out_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
// Use thrust for parallel acceleration when the input size is equal to the
// length of the 'axis' dimension.
// Compared to the following 'Special case for full sort', ascending sort is
// 34 times faster and descending sort is 31 times faster.
if (size == in_dims[axis]) {
thrust::sequence(thrust::device, ids_data, ids_data + size);
thrust::copy(thrust::device, in_data, in_data + size, out_data);
thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data);
if (descending) {
thrust::reverse(thrust::device, out_data, out_data + size);
thrust::reverse(thrust::device, ids_data, ids_data + size);
}
return;
}
// Special case for full sort, speedup ~190x.
if (axis == -1 || axis + 1 == in_dims.size()) {
......@@ -276,23 +322,28 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
int axis = ctx.Attr<int>("axis");
dX->mutable_data<T>(ctx.GetPlace());
auto dxt = framework::EigenVector<T>::Flatten(*dX);
auto& place = *ctx.template device_context<platform::CUDADeviceContext>()
.eigen_device();
dxt.device(place) = dxt.constant(static_cast<T>(0));
if (dO->numel() == 0) return;
auto in_dims = indices->dims();
auto in_dims = dX->dims();
axis = (axis < 0) ? (in_dims.size() + axis) : axis;
int64_t numel = indices->numel();
int64_t size = dX->numel();
const auto& dev_ctx = ctx.cuda_device_context();
// Parallel acceleration when the input size is equal to the length of the
// 'axis' dimension.
// Compared to 'special case for full sort' below, the gradient calculation
// is 10 times faster.
if (size == in_dims[axis]) {
ArgFlattenAssign<T>(dev_ctx, dO, indices, size, dX);
return;
}
// Special case for full sort, speedup ~190x.
if (axis == -1 || axis + 1 == in_dims.size()) {
const int64_t input_height = framework::product(
framework::slice_ddim(in_dims, 0, in_dims.size() - 1));
const int64_t input_width = in_dims[in_dims.size() - 1];
const auto& dev_ctx = ctx.cuda_device_context();
ArgFullAssign<T, int64_t>(dev_ctx, dO, indices, dX, input_height,
input_width);
} else {
......@@ -316,7 +367,6 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
Tensor trans_ind;
trans_ind.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
int ndims = trans.size();
const auto& dev_ctx = ctx.cuda_device_context();
// Do transpose
TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *dO,
&trans_dO, trans);
......@@ -345,11 +395,17 @@ class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
argsort, paddle::operators::ArgsortOpCUDAKernel<float>,
paddle::operators::ArgsortOpCUDAKernel<double>,
paddle::operators::ArgsortOpCUDAKernel<int>,
paddle::operators::ArgsortOpCUDAKernel<int64_t>,
paddle::operators::ArgsortOpCUDAKernel<paddle::platform::float16>);
argsort,
paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
double>,
paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
int>,
paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ArgsortOpCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel<float>,
paddle::operators::ArgsortGradOpCUDAKernel<double>,
......
......@@ -76,7 +76,10 @@ class AssignValueKernel : public framework::OpKernel<T> {
value_name = "int64_values";
break;
default:
PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype);
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported data type(code %d) for AssignValue operator, only "
"supports bool, int32, float32 and int64.",
dtype));
break;
}
CopyVecotorToTensor<T>(value_name, out, ctx);
......
......@@ -831,6 +831,401 @@ void BatchNormGradMaker<T>::Apply(GradOpPtr<T> op) const {
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
}
template <typename T>
void BatchNormDoubleGradMaker<T>::Apply(GradOpPtr<T> op) const {
op->SetType("batch_norm_grad_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("SavedMean", this->Input("SavedMean"));
op->SetInput("SavedVariance", this->Input("SavedVariance"));
if (BOOST_GET_CONST(bool, this->GetAttr("use_global_stats"))) {
op->SetInput("Variance", this->Input("Variance"));
}
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DDScale", this->OutputGrad(framework::GradVarName("Scale")));
op->SetInput("DDBias", this->OutputGrad(framework::GradVarName("Bias")));
op->SetInput("DY", this->Input(framework::GradVarName("Y")));
op->SetAttrMap(this->Attrs());
op->SetOutput("DX", this->InputGrad("X"));
op->SetOutput("DScale", this->InputGrad("Scale"));
op->SetOutput("DDY", this->InputGrad(framework::GradVarName("Y")));
}
void BatchNormDoubleGradOp::InferShape(
framework::InferShapeContext *ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BatchNormDoubleGrad");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
"BatchNormDoubleGrad");
OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
"BatchNormDoubleGrad");
OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
"BatchNormDoubleGrad");
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
if (use_global_stats) {
OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "VarianceOut",
"BatchNormDoubleGrad");
}
OP_INOUT_CHECK(ctx->HasInput("DDX"), "Input", "DDX", "BatchNormDoubleGrad");
OP_INOUT_CHECK(ctx->HasInput("DY"), "Input", "DY", "BatchNormDoubleGrad");
// check output
OP_INOUT_CHECK(ctx->HasOutput("DX"), "Output", "DX", "BatchNormDoubleGrad");
const auto x_dims = ctx->GetInputDim("X");
const int C = x_dims[1];
if (ctx->HasOutput("DX")) {
ctx->SetOutputDim("DX", x_dims);
}
if (ctx->HasOutput("DScale")) {
ctx->SetOutputDim("DScale", {C});
}
if (ctx->HasOutput("DDY")) {
ctx->ShareDim("X", "DDY");
}
}
framework::OpKernelType BatchNormDoubleGradOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
const auto *var = ctx.InputVar("DY");
if (var == nullptr) {
PADDLE_THROW(
platform::errors::NotFound("cannot find gradient variable of Y"));
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW(
platform::errors::InvalidArgument("gradient variable of Y is empty"));
}
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
template <typename T>
class BatchNormDoubleGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const float epsilon = ctx.Attr<float>("epsilon");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
dX->mutable_data<T>(ctx.GetPlace());
ddY->mutable_data<T>(ctx.GetPlace());
auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
const auto &x_dims = X->dims();
const int C =
(data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]);
const int sample_size = X->numel() / C;
math::SetConstant<platform::CPUDeviceContext, T> set_constant;
const T *mean_data = Saved_mean->data<T>();
const T *inv_var_data = Saved_variance->data<T>();
Tensor inv_var_tensor;
if (use_global_stats) {
const auto *running_variance = ctx.Input<Tensor>("Variance");
inv_var_tensor.Resize({C});
T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse();
inv_var_data = running_inv_var_data;
}
// transpose NCHW -> NHWC for easy calculate
Tensor transformed_x(X->type());
Tensor transformed_dy(dY->type());
Tensor transformed_ddx(ddX->type());
Tensor transformed_dx(dX->type());
Tensor transformed_ddy(ddY->type());
if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) {
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
// Input Tensor
ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, X,
&transformed_x);
TransToChannelLast<platform::CPUDeviceContext, T>(ctx, X, &transformed_x);
ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, dY,
&transformed_dy);
TransToChannelLast<platform::CPUDeviceContext, T>(ctx, dY,
&transformed_dy);
ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, ddX,
&transformed_ddx);
TransToChannelLast<platform::CPUDeviceContext, T>(ctx, ddX,
&transformed_ddx);
// Output Tensor
ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, dX,
&transformed_dx);
ResizeToChannelLast<platform::CPUDeviceContext, T>(ctx, ddY,
&transformed_ddy);
} else {
transformed_x.ShareDataWith(*X);
transformed_dy.ShareDataWith(*dY);
transformed_ddx.ShareDataWith(*ddX);
transformed_dx.ShareDataWith(*dX);
transformed_ddy.ShareDataWith(*ddY);
}
ConstEigenArrayMap<T> x_arr(transformed_x.data<T>(), C, sample_size);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
Tensor mean_tile;
mean_tile.Resize({C, sample_size});
mean_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> mean_tile_data(mean_tile.mutable_data<T>(ctx.GetPlace()),
C, sample_size);
Tensor inv_var_tile;
inv_var_tile.Resize({C, sample_size});
inv_var_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> inv_var_tile_data(
inv_var_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
mean_tile_data = mean_arr.replicate(1, sample_size);
inv_var_tile_data = inv_var_arr.replicate(1, sample_size);
Tensor Scale_data;
if (!Scale) {
Scale_data.mutable_data<T>({C}, ctx.GetPlace());
set_constant(dev_ctx, &Scale_data, static_cast<T>(1));
}
ConstEigenVectorArrayMap<T> scale_arr(
Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
Tensor scale_tile;
scale_tile.Resize({C, sample_size});
scale_tile.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> scale_tile_data(scale_tile.mutable_data<T>(ctx.GetPlace()),
C, sample_size);
scale_tile_data = scale_arr.replicate(1, sample_size);
ConstEigenArrayMap<T> dy_arr(transformed_dy.data<T>(), C, sample_size);
ConstEigenArrayMap<T> ddx_arr(transformed_ddx.data<T>(), C, sample_size);
Tensor x_sub_mean_mul_invstd;
x_sub_mean_mul_invstd.Resize({C, sample_size});
x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> x_sub_mean_mul_invstd_arr(
x_sub_mean_mul_invstd.mutable_data<T>(ctx.GetPlace()), C, sample_size);
x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data;
if (dX) {
dX->mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> dx_arr(transformed_dx.mutable_data<T>(ctx.GetPlace()), C,
sample_size);
dx_arr.setZero();
if (use_global_stats) {
// math: dx = (ddscale * dy) * inv_var
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data;
}
} else {
// math: dx = scale * ((x - mean) * inv_var / NxHxW *
//         (np.mean(ddx, axis=(n,h,w)) * np.sum(dy, axis=(n,h,w)) -
//          np.sum(dy * ddx, axis=(n,h,w)) +
//          3 * np.mean(dy * (x - mean), axis=(n,h,w)) * inv_var.pow(2) *
//            np.sum(ddx * (x - mean), axis=(n,h,w))) +
//         inv_var.pow(3) / NxHxW * np.sum(ddx * (x - mean)) *
//           (np.mean(dy, axis=(n,h,w)) - dy) +
//         inv_var.pow(3) / NxHxW * np.sum(dy, axis=(n,h,w)) * (x - mean) *
//           (np.mean(ddx, axis=(n,h,w)) - ddx) +
//         ddr * (dy * inv_var - inv_var * np.mean(dy, axis=(n,h,w)) -
//           inv_var.pow(3) * (x - mean) *
//           np.mean(dy * (x - mean), axis=(n,h,w))))
if (ddX) {
dx_arr +=
(x_sub_mean_mul_invstd_arr * inv_var_tile_data *
inv_var_tile_data / sample_size)
.colwise() *
(ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size -
(dy_arr * ddx_arr).rowwise().sum() +
3. * (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size *
(dy_arr.rowwise().sum() / sample_size - dy_arr);
dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() *
(dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() /
sample_size *
(ddx_arr.rowwise().sum() / sample_size - ddx_arr);
dx_arr = scale_tile_data * dx_arr;
}
if (ddScale) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
dx_arr += (dy_arr * inv_var_tile_data -
(dy_arr.rowwise().sum().replicate(1, sample_size) /
sample_size) *
inv_var_tile_data -
x_sub_mean_mul_invstd_arr * inv_var_tile_data *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size) *
ddscale_tile_data;
}
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
ctx, &transformed_dx, dX);
}
}
if (dScale) {
dScale->mutable_data<T>(ctx.GetPlace());
EigenVectorArrayMap<T> dscale_arr(dScale->mutable_data<T>(ctx.GetPlace()),
C);
dscale_arr.setZero();
if (use_global_stats) {
// math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var
if (ddX) {
dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum();
}
} else {
// math: dscale = np.sum(ddx * inv_var * (dy - np.mean(dy, axis=(n,h,w)) -
//            (x - mean) * inv_var.pow(2) *
//            np.mean(dy * (x - mean), axis=(n,h,w))), axis=(n,h,w))
if (ddX) {
Tensor first_grad;
first_grad.Resize({C, sample_size});
EigenArrayMap<T> first_grad_arr(
first_grad.mutable_data<T>(ctx.GetPlace()), C, sample_size);
first_grad_arr.setZero();
first_grad_arr +=
inv_var_tile_data *
(dy_arr -
dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum();
}
}
}
if (ddY) {
ddY->mutable_data<T>(ctx.GetPlace());
EigenArrayMap<T> ddy_arr(transformed_ddy.mutable_data<T>(ctx.GetPlace()),
C, sample_size);
ddy_arr.setZero();
if (use_global_stats) {
// math: ddy = r * ddx * inv_var
if (ddX) {
ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data;
}
} else {
// math: ddy = (x - mean) * inv_var * ddscale + ddbias +
// scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) *
// np.mean(ddx * (x - mean), axis=(n,h,w)))
if (ddX) {
ddy_arr +=
scale_tile_data * inv_var_tile_data *
(ddx_arr -
ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size -
x_sub_mean_mul_invstd_arr *
(ddx_arr * x_sub_mean_mul_invstd_arr)
.rowwise()
.sum()
.replicate(1, sample_size) /
sample_size);
}
if (ddScale && ddBias) {
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
Tensor ddscale_tile;
ddscale_tile.Resize({C, sample_size});
EigenArrayMap<T> ddscale_tile_data(
ddscale_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
ddscale_tile_data = ddscale_arr.replicate(1, sample_size);
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
Tensor ddbias_tile;
ddbias_tile.Resize({C, sample_size});
EigenArrayMap<T> ddbias_tile_data(
ddbias_tile.mutable_data<T>(ctx.GetPlace()), C, sample_size);
ddbias_tile_data = ddbias_arr.replicate(1, sample_size);
ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data;
ddy_arr += ddbias_tile_data;
}
}
if (data_layout == DataLayout::kNCHW) {
VLOG(3) << "Transform batchnorm output from NHWC to NCHW";
TransToChannelFirst<paddle::platform::CPUDeviceContext, T>(
ctx, &transformed_ddy, ddY);
}
}
}
};
DECLARE_INPLACE_OP_INFERER(BatchNormDoubleGradOpInplaceInferer, {"DY", "DDY"});
} // namespace operators
} // namespace paddle
......@@ -839,7 +1234,11 @@ REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
ops::BatchNormOpInferVarType,
ops::BatchNormGradMaker<paddle::framework::OpDesc>,
ops::BatchNormGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp);
REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp,
ops::BatchNormDoubleGradMaker<paddle::framework::OpDesc>,
ops::BatchNormDoubleGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp,
ops::BatchNormDoubleGradOpInplaceInferer);
REGISTER_OP_CPU_KERNEL(
batch_norm, ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>,
......@@ -848,3 +1247,7 @@ REGISTER_OP_CPU_KERNEL(
batch_norm_grad,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::BatchNormDoubleGradKernel<paddle::platform::CPUDeviceContext, double>);
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.cu.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
......@@ -840,6 +841,45 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
}
};
template <typename T>
class BatchNormDoubleGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *X = ctx.Input<Tensor>("X");
const auto *Scale = ctx.Input<Tensor>("Scale");
const auto *dY = ctx.Input<Tensor>("DY");
const auto *Saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *Saved_variance = ctx.Input<Tensor>("SavedVariance");
const double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE_EQ(
is_test, false,
platform::errors::InvalidArgument(
"`is_test = True` CANNOT be used in train program. If "
"you want to use global status in pre_train model, "
"please set `use_global_stats = True`"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *ddX = ctx.Input<Tensor>("DDX");
const auto *ddScale = ctx.Input<Tensor>("DDScale");
const auto *ddBias = ctx.Input<Tensor>("DDBias");
auto *dX = ctx.Output<Tensor>("DX");
auto *dScale = ctx.Output<Tensor>("DScale");
auto *ddY = ctx.Output<Tensor>("DDY");
NormDoubleGradFunctor<platform::CUDADeviceContext, T>(
ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon,
use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY);
}
};
} // namespace operators
} // namespace paddle
......@@ -853,3 +893,7 @@ REGISTER_OP_CUDA_KERNEL(
batch_norm_grad, ops::BatchNormGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, double>,
ops::BatchNormGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
batch_norm_grad_grad,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, float>,
ops::BatchNormDoubleGradKernel<plat::CUDADeviceContext, double>);
......@@ -103,6 +103,42 @@ inline void TransToChannelFirst(const framework::ExecutionContext& context,
}
}
template <typename DeviceContext, typename T>
inline void ResizeToChannelLast(const framework::ExecutionContext& context,
const Tensor* input,
Tensor* transformed_input) {
int dim = input->dims().size() - 2;
if (dim == 3) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[4];
in_dims_vec[4] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 2) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[3];
in_dims_vec[3] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
} else if (dim == 1) {
transformed_input->Resize(input->dims());
auto in_dims_vec = framework::vectorize(input->dims());
in_dims_vec[1] = input->dims()[2];
in_dims_vec[2] = input->dims()[1];
transformed_input->Resize(framework::make_ddim(in_dims_vec));
transformed_input->mutable_data<T>(context.GetPlace());
}
}
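// Example for ResizeToChannelLast above (illustrative): for dim == 2 an NCHW
// input of shape [N, C, H, W] is resized to the channel-last shape
// [N, H, W, C]; dim == 3 maps NCDHW to NDHWC and dim == 1 maps NCL to NLC in
// the same way.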
template <typename DeviceContext, typename T>
inline void TransToChannelLast(const framework::ExecutionContext& context,
const Tensor* input, Tensor* transformed_input) {
......@@ -154,6 +190,16 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
const framework::OpKernelType& expected_kernel_type) const override;
};
class BatchNormDoubleGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
......@@ -168,6 +214,15 @@ class BatchNormGradMaker : public framework::SingleGradOpMaker<T> {
void Apply(GradOpPtr<T> op) const override;
};
template <typename T>
class BatchNormDoubleGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override;
};
class BatchNormOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
......@@ -190,5 +245,11 @@ class BatchNormGradKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class BatchNormDoubleGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
......@@ -33,29 +33,37 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
auto out_vars = context.MultiOutputVar("Output");
PADDLE_ENFORCE_GT(in_var_names.size(), static_cast<size_t>(0),
"The CoalesceTensorOp has no input.");
PADDLE_ENFORCE_EQ(
in_var_names.size(), out_var_names.size(),
"The number of CoalesceTensorOp's input and output is not match.");
platform::errors::InvalidArgument(
"The CoalesceTensor operator has no input."));
PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size(),
platform::errors::InvalidArgument(
"The number of CoalesceTensor operator's input and "
"output is not match, "
"input number is %u, output number is %u.",
in_var_names.size(), out_var_names.size()));
// Input & Output check: only support LoDTensor
for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_NOT_NULL(
in_vars[i],
"The input variable %s of CoalesceTensorOp does not exist.",
in_var_names[i]);
platform::errors::NotFound("The input variable %s of CoalesceTensor "
"operator does not exist.",
in_var_names[i]));
PADDLE_ENFORCE_NOT_NULL(
out_vars[i],
"The output variable %s of CoalesceTensorOp does not exist.",
out_var_names[i]);
PADDLE_ENFORCE_EQ(
in_vars[i]->IsType<framework::LoDTensor>(), true,
"The input variable %s of CoalesceTensorOp is not LoDTensor.",
in_var_names[i]);
PADDLE_ENFORCE_EQ(
out_vars[i]->IsType<framework::LoDTensor>(), true,
"The output variable %s of CoalesceTensorOp is not LoDTensor.",
in_var_names[i]);
platform::errors::NotFound("The output variable %s of CoalesceTensor "
"operator does not exist.",
out_var_names[i]));
PADDLE_ENFORCE_EQ(in_vars[i]->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The input variable %s of CoalesceTensor operator "
"is not LoDTensor.",
in_var_names[i]));
PADDLE_ENFORCE_EQ(out_vars[i]->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The output variable %s of CoalesceTensor operator "
"is not LoDTensor.",
in_var_names[i]));
}
auto in_tensors = context.MultiInput<framework::LoDTensor>("Input");
......@@ -64,7 +72,10 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < in_var_names.size(); ++i) {
PADDLE_ENFORCE_EQ(
in_var_names[i], out_var_names[i],
"The input and output variable of CoalesceTensorOp is different.");
platform::errors::InvalidArgument(
"The input and output variable of CoalesceTensor operator is "
"different, %dth input is %s, %dth output is %s.",
i, in_var_names[i], i, out_var_names[i]));
}
} else {
// Init the output as input
......@@ -134,16 +145,25 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
const std::vector<const framework::LoDTensor *> &lod_tensors,
const std::vector<std::string> var_names, size_t *numel,
const size_t &size_of_dtype, const platform::Place &place) const {
PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size());
PADDLE_ENFORCE_EQ(
lod_tensors.size(), var_names.size(),
platform::errors::InvalidArgument(
"The number of input tensor and variable does not match, the "
"number of input tensor is %u, the number of input variable is %u.",
lod_tensors.size(), var_names.size()));
*numel = 0;
std::stringstream ss;
ss << "alloc_space_for_vars: ";
for (size_t i = 0; i < var_names.size(); ++i) {
PADDLE_ENFORCE_EQ(lod_tensors[i]->IsInitialized(), true,
"%s is not initialized.", var_names[i]);
platform::errors::InvalidArgument(
"Tensor `%s` is not initialized.", var_names[i]));
auto size = lod_tensors[i]->numel();
PADDLE_ENFORCE_GT(size, 0);
PADDLE_ENFORCE_GT(
size, 0,
platform::errors::InvalidArgument(
"The number of tensor `%s`'s elements is 0.", var_names[i]));
ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims()
<< ") "
<< " addres:" << lod_tensors[i]->data<void>() << ", ";
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/concat_op.h"
#include <memory>
#include <string>
#include <vector>
......@@ -78,7 +79,8 @@ class ConcatOp : public framework::OperatorWithKernel {
}
}
if (flag == 0) {
PADDLE_THROW("All Inputs of Concat OP are Empty!");
PADDLE_THROW(platform::errors::InvalidArgument(
"All Inputs of Concat OP are Empty!"));
}
#ifdef PADDLE_WITH_MKLDNN
if (platform::CanMKLDNNBeUsed(ctx)) {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
......@@ -287,7 +288,9 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
#endif
// ------------------- cudnn conv forward ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv: use_addto = " << ctx.Attr<bool>("use_addto");
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* workspace_ptr) {
......@@ -609,9 +612,13 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
}
// ------------------- cudnn conv backward data ---------------------
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f : 0.0f;
VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr<bool>("use_addto");
if (input_grad) {
// Because beta is zero, it is unnecessary to reset input_grad.
// When beta is 0, it is unnecessary to reset input_grad.
// When beta is 1, the output cannot be reset since the addto strategy is used.
for (int i = 0; i < groups; i++) {
workspace_handle.RunFunc(
[&](void* cudnn_workspace_ptr) {
......@@ -653,6 +660,9 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
ctx, &transformed_input_grad_channel, input_grad);
}
}
// filter_grad does not use inplace addto.
ScalingParamType<T> beta_filter = 0.0f;
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
// Because beta is zero, it is unnecessary to reset filter_grad.
......@@ -665,7 +675,7 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
input_data + i * group_offset_in, args2.odesc.desc(),
output_grad_data + i * group_offset_out,
args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr,
workspace_size, &beta, args2.wdesc.desc(),
workspace_size, &beta_filter, args2.wdesc.desc(),
filter_grad_data + i * group_offset_filter));
},
workspace_size);
......@@ -1017,7 +1027,14 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
int group_offset_out = o_c / groups * o_h * o_w * o_d;
int group_offset_filter = W->numel() / groups;
ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
ScalingParamType<T> alpha = 1.0f;
ScalingParamType<T> beta = 0.0f;
// NOTE(zhiqiu): inplace addto is not supported in double grad yet.
// ScalingParamType<T> beta = ctx.Attr<bool>("use_addto") ? 1.0f :
// 0.0f;
// VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr<bool>("use_addto");
auto wkspace_handle = dev_ctx.cudnn_workspace_handle();
if (ddO) {
......
......@@ -305,6 +305,11 @@ void Conv2DOpMaker::Make() {
.SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
......@@ -460,6 +465,11 @@ void Conv3DOpMaker::Make() {
.SetDefault(0.0f);
AddAttr<float>("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel")
.SetDefault(0.0f);
AddAttr<bool>(
"use_addto",
"(bool, default false) If use addto strategy or not, only used in "
"cudnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
......
......@@ -45,10 +45,8 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
"Input(X) of DequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
"Output(Out) of DequantizeMaxAbsOp should not be null.");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DequantizeMaxAbs");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DequantizeMaxAbs");
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out");
......
......@@ -532,7 +532,8 @@ static int count_contours(polygon_node *polygon) {
}
static void add_left(polygon_node *p, double x, double y) {
PADDLE_ENFORCE_NOT_NULL(p);
PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
"Input polygon node is nullptr."));
vertex_node *nv = NULL;
/* Create a new vertex node and set its fields */
......@@ -588,7 +589,8 @@ static void add_right(polygon_node *p, double x, double y) {
}
static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) {
PADDLE_ENFORCE_NOT_NULL(p);
PADDLE_ENFORCE_NOT_NULL(p, paddle::platform::errors::InvalidArgument(
"Input polygon node is nullptr."));
polygon_node *target = NULL;
/* Label contour as external */
......@@ -664,7 +666,8 @@ void add_vertex(vertex_node **t, double x, double y) {
}
void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) {
PADDLE_ENFORCE_NOT_NULL(e);
PADDLE_ENFORCE_NOT_NULL(e, paddle::platform::errors::InvalidArgument(
"Input edge node is nullptr."));
add_vertex(&(e->outp[p]->v[s]), x, y);
e->outp[p]->active++;
}
......@@ -693,7 +696,8 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
const_cast<char *>("Bounding box creation"));
PADDLE_ENFORCE_NOT_NULL(box);
PADDLE_ENFORCE_NOT_NULL(box, paddle::platform::errors::ResourceExhausted(
"Failed to malloc box memory."));
/* Construct contour bounding boxes */
for (c = 0; c < p->num_contours; c++) {
......@@ -857,7 +861,9 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
/* Create an extended hole array */
gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
const_cast<char *>("contour hole addition"));
PADDLE_ENFORCE_NOT_NULL(extended_hole);
PADDLE_ENFORCE_NOT_NULL(extended_hole,
paddle::platform::errors::ResourceExhausted(
"Failed to malloc extended hole memory."));
/* Create an extended contour array */
gpc_malloc<gpc_vertex_list>(extended_contour,
......@@ -975,7 +981,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
/* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation"));
PADDLE_ENFORCE_NOT_NULL(sbt);
PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
"Failed to malloc scanbeam table memory."));
build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0;
free_sbtree(&sbtree);
......@@ -1017,7 +1025,9 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
e0 = aet;
e1 = aet;
/* Set up bundle fields of first edge */
PADDLE_ENFORCE_NOT_NULL(aet);
PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
"Edge node AET is nullptr."));
aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
aet->bundle[ABOVE][!aet->type] = 0;
aet->bstate[ABOVE] = UNBUNDLED;
......@@ -1612,7 +1622,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
/* Build scanbeam table from scanbeam tree */
gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
const_cast<char *>("sbt creation"));
PADDLE_ENFORCE_NOT_NULL(sbt);
PADDLE_ENFORCE_NOT_NULL(sbt, paddle::platform::errors::ResourceExhausted(
"Failed to malloc scanbeam table memory."));
build_sbt(&scanbeam, sbt, sbtree);
scanbeam = 0;
free_sbtree(&sbtree);
......@@ -1650,7 +1661,8 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
e1 = aet;
/* Set up bundle fields of first edge */
PADDLE_ENFORCE_NOT_NULL(aet);
PADDLE_ENFORCE_NOT_NULL(aet, paddle::platform::errors::InvalidArgument(
"Edge node AET is nullptr."));
aet->bundle[ABOVE][aet->type] = (aet->top.y != yb);
aet->bundle[ABOVE][!aet->type] = 0;
aet->bstate[ABOVE] = UNBUNDLED;
......
......@@ -48,7 +48,9 @@ class FetchBarrierOp : public framework::OperatorBase {
}
for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U,
platform::errors::Unavailable(
"Internal error occurred in RPCClient."));
}
}
};
......
......@@ -34,16 +34,16 @@ inline bool NeedSend(const framework::Scope& scope,
std::string::npos)
return false;
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
varname);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound(
"Can not find variable '%s' in the send side.", varname));
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
PADDLE_THROW(platform::errors::Unimplemented(
"Variable type in send side should be LodTensor or SelectedRows."));
}
return false;
}
......
......@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
namespace paddle {
......@@ -129,3 +132,18 @@ REGISTER_OP_CPU_KERNEL(
int>,
ops::ElementwiseAddDoubleGradKernel<paddle::platform::CPUDeviceContext,
int64_t>);
// A specialized elementwise_add operator, used in gradient accumulation with
// inplace addto.
REGISTER_OPERATOR(
grad_add, paddle::operators::ElementwiseOp,
paddle::operators::ElementwiseAddOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
grad_add,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
......@@ -111,3 +111,10 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddDoubleGradKernel<plat::CUDADeviceContext,
plat::float16>);
REGISTER_OP_CUDA_KERNEL(
grad_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddKernel<plat::CUDADeviceContext, plat::float16>);
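// Illustrative note (not part of this patch): grad_add reuses the ordinary
// elementwise add kernels, so the registration above only gives the framework
// a distinct op name for accumulating gradients. Conceptually (hypothetical
// names, the real math is provided by ops::ElementwiseAddKernel):
//
//   void GradAdd(const float* x, const float* y, float* out, int n) {
//     for (int i = 0; i < n; ++i) out[i] = x[i] + y[i];  // same as add
//   }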
......@@ -174,7 +174,64 @@ struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext, T> {
template struct ChannelClipAndFakeQuantFunctor<platform::CPUDeviceContext,
float>;
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
auto* scale_data = scale.data<T>();
auto* in_data = in.data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
auto in_dims = in.dims();
const int64_t channel = in_dims[quant_axis];
platform::Transform<platform::CPUDeviceContext> trans;
if (quant_axis == 0) {
const int64_t channel_size = in.numel() / channel;
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
auto* start = in_data + i * channel_size;
auto* end = in_data + (i + 1) * channel_size;
trans(ctx, start, end, out_data + i * channel_size,
ClipFunctor<T>(-s, s));
}
for (int i = 0; i < channel; i++) {
T s = scale_data[i];
T inv_s = inverse(s);
framework::Tensor one_channel_out = out->Slice(i, i + 1);
auto out_e = framework::EigenVector<T>::Flatten(one_channel_out);
out_e.device(*ctx.eigen_device()) =
(bin_cnt * inv_s * out_e).round() * s / static_cast<T>(bin_cnt);
}
} else if (quant_axis == 1) {
const int64_t step_i = in.numel() / in_dims[0];
const int64_t step_j = in.numel() / (in_dims[0] * in_dims[1]);
for (int i = 0; i < in_dims[0]; i++) {
for (int j = 0; j < in_dims[1]; j++) {
T s = scale_data[j];
T inv_s = inverse(s);
auto* start = in_data + i * step_i + j * step_j;
auto* end = in_data + i * step_i + (j + 1) * step_j;
auto* cur_out_data = out_data + i * step_i + j * step_j;
trans(ctx, start, end, cur_out_data, ClipFunctor<T>(-s, s));
for (int k = 0; k < step_j; k++) {
cur_out_data[k] = std::round(bin_cnt * inv_s * cur_out_data[k]) *
s / static_cast<T>(bin_cnt);
}
}
}
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CPUDeviceContext,
float>;
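// Worked example (illustrative, not part of this patch): with bin_cnt = 127
// (8-bit) and a channel scale s = 0.5, an input x = 0.26 is clipped to
// [-0.5, 0.5], then quantized and dequantized as
//   q = round(127 * 0.26 / 0.5) = 66,  out = 66 * 0.5 / 127 ≈ 0.2598,
// which is the per-channel computation the functor above performs, e.g.:
//
//   float FakeQuantDequant(float x, float s, int bin_cnt) {
//     x = std::max(-s, std::min(s, x));                   // clip
//     return std::round(bin_cnt * x / s) * s / bin_cnt;   // quant + dequant
//   }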
template <typename T>
struct FindRangeAbsMaxFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
......@@ -360,6 +417,75 @@ $$0 \leq c \lt \ the\ channel\ number\ of\ X$$
}
};
class FakeChannelWiseQuantizeDequantizeAbsMaxOp
: public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
"FakeChannelWiseQuantizeDequantizeAbsMax");
OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale",
"FakeChannelWiseQuantizeDequantizeAbsMax");
int quant_axis = ctx->Attrs().Get<int>("quant_axis");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->SetOutputDim("OutScale", {ctx->GetInputDim("X")[quant_axis]});
ctx->ShareLoD("X", /*->*/ "Out");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) Input is float data type.");
AddOutput("Out",
"(Tensor) Output of quantized and dequantized low level tensor, "
"saved as float data type.");
AddOutput("OutScale", "(Tensor) Current channel wise scale");
AddAttr<int>("quant_axis",
"(int, default 0) The axis for quantization. "
"For conv2d, depthwise_conv2d, conv2d_transpose "
"and mul, the quant_axis is equal to the cout axis.")
.SetDefault(0)
.AddCustomChecker([](const int& quant_axis) {
PADDLE_ENFORCE_EQ(quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument(
"'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
});
AddAttr<int>("bit_length", "(int, default 8)")
.SetDefault(8)
.AddCustomChecker([](const int& bit_length) {
PADDLE_ENFORCE_EQ(bit_length >= 1 && bit_length <= 16, true,
platform::errors::InvalidArgument(
"'bit_length' should be between 1 and 16, but "
"the received is %d",
bit_length));
});
AddComment(R"DOC(
The scale of FakeChannelWiseQuantizeDequantize operator is a vector.
In detail, each channel of the input X has a scale value.
$$scale_c = max(abs(X_c))$$
$$range = 2^{bit\_length - 1} - 1$$
$$Out_c = round(\frac{X_c * range} {scale_c}) * \frac{scale_c} {range}$$
In above three formulas, the range value of c is as follow:
$$0 \leq c \lt \ the\ channel\ number\ of\ X$$
)DOC");
}
};
class FakeQuantizeRangeAbsMaxOp : public framework::OperatorWithKernel {
public:
FakeQuantizeRangeAbsMaxOp(const std::string& type,
......@@ -666,3 +792,12 @@ REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale,
REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp);
REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CPU, float>);
REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker,
ops::FakeQuantDequantGradMaker<paddle::framework::OpDesc>,
ops::FakeQuantDequantGradMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CPU, float>);
......@@ -417,8 +417,90 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
}
};
template struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext,
float>;
// ChannelClipAndQuantDequantKernel for quant_axis is 0
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis0(
const T* in, const T* scale, const int bin_cnt, const int n, const int c,
T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
// ChannelClipAndQuantDequantKernel for quant_axis is 1
template <typename T>
__global__ void ChannelClipAndQuantDequantKernelQuantAxis1(
const T* in, const T* scale, const int bin_cnt, const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v) * s / bin_cnt;
}
}
template <typename T>
struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& ctx,
const framework::Tensor& in, const framework::Tensor& scale,
const int bin_cnt, const int quant_axis,
framework::Tensor* out) {
// At present, channelwise quantization supports conv2d, depthwise_conv2d,
// conv2d_transpose and mul
PADDLE_ENFORCE_EQ(
quant_axis == 0 || quant_axis == 1, true,
platform::errors::InvalidArgument("'quant_axis' should be 0 or 1, but "
"the received is %d",
quant_axis));
int num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis0<
T><<<grid, block, 0, ctx.stream()>>>(in_data, scale_data, bin_cnt,
num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantDequantKernelQuantAxis1<
T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
}
}
};
template struct ChannelClipFakeQuantDequantFunctor<platform::CUDADeviceContext,
float>;
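// Illustrative note (not part of this patch): the launch configuration maps
// one thread block to one channel slice. For quant_axis == 0 there are
// in_dims[0] slices of size numel / in_dims[0]; for quant_axis == 1 there are
// in_dims[0] * in_dims[1] slices of size numel / (in_dims[0] * in_dims[1]).
// A hypothetical helper computing the grid size:
//
//   int GridSizeForQuantAxis(const std::vector<int64_t>& dims, int axis) {
//     return axis == 0 ? static_cast<int>(dims[0])
//                      : static_cast<int>(dims[0] * dims[1]);
//   }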
} // namespace operators
} // namespace paddle
......@@ -443,3 +525,6 @@ REGISTER_OP_CUDA_KERNEL(
ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad,
ops::FakeQuantDequantGradKernel<CUDA, float>);
REGISTER_OP_CUDA_KERNEL(
fake_channel_wise_quantize_dequantize_abs_max,
ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel<CUDA, float>);
......@@ -72,6 +72,13 @@ struct ChannelClipAndFakeQuantFunctor {
const int quant_axis, framework::Tensor* out);
};
template <typename DeviceContext, typename T>
struct ChannelClipFakeQuantDequantFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in,
const framework::Tensor& scale, const int bin_cnt,
const int quant_axis, framework::Tensor* out);
};
template <typename DeviceContext, typename T>
struct FindMovingAverageAbsMaxFunctor {
void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum,
......@@ -154,6 +161,30 @@ class FakeChannelWiseQuantizeAbsMaxKernel : public framework::OpKernel<T> {
}
};
template <typename DeviceContext, typename T>
class FakeChannelWiseQuantizeDequantizeAbsMaxKernel
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* in = context.Input<framework::Tensor>("X");
auto* out = context.Output<framework::Tensor>("Out");
auto* out_scale = context.Output<framework::Tensor>("OutScale");
T* out_scale_data = out_scale->mutable_data<T>(context.GetPlace());
auto& dev_ctx = context.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
int bit_length = context.Attr<int>("bit_length");
int bin_cnt = std::pow(2, bit_length - 1) - 1;
int quant_axis = context.Attr<int>("quant_axis");
FindChannelAbsMaxFunctor<DeviceContext, T>()(dev_ctx, *in, quant_axis,
out_scale_data);
ChannelClipFakeQuantDequantFunctor<DeviceContext, T>()(
dev_ctx, *in, *out_scale, bin_cnt, quant_axis, out);
}
};
template <typename DeviceContext, typename T>
class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel<T> {
public:
......
......@@ -8,7 +8,8 @@ register_operators(EXCLUDES
multihead_matmul_op
fused_embedding_eltwise_layernorm_op
fusion_group_op
fusion_gru_op)
fusion_gru_op
fused_bn_add_activation_op)
# fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op)
......@@ -47,4 +48,9 @@ if (WITH_GPU)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_group);\n")
cc_test(test_fusion_group_op SRCS fusion_group_op_test.cc DEPS fusion_group_op)
endif()
# fused_bn_add_activation
if (NOT ${CUDNN_VERSION} VERSION_LESS 7401)
op_library(fused_bn_add_activation_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_bn_add_activation);\n")
endif()
endif()
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
void FusedBatchNormAddActOp::InferShape(
framework::InferShapeContext *ctx) const {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
"FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasInput("Bias"), "Input", "Bias",
"FusedBatchNormAddActOp");
// check output
OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasOutput("MeanOut"), "Output", "MeanOut",
"FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasOutput("VarianceOut"), "Output", "VarianceOut",
"FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasOutput("SavedMean"), "Output", "SavedMean",
"FusedBatchNormAddActOp");
OP_INOUT_CHECK(ctx->HasOutput("SavedVariance"), "Output", "SavedVariance",
"FusedBatchNormAddActOp");
const auto x_dims = ctx->GetInputDim("X");
const auto z_dims = ctx->GetInputDim("Z");
PADDLE_ENFORCE_EQ(x_dims, z_dims,
platform::errors::InvalidArgument(
"ShapeError: the shapes of input "
"must be equal. But received: the shape "
"of input X = [%s], and the shape of "
"input Y = [%s]",
x_dims, z_dims));
PADDLE_ENFORCE_GE(x_dims.size(), 2, platform::errors::InvalidArgument(
"ShapeError: the dimensions of input "
"must greater than or equal to 2."
"But received: the shape of input "
"= [%s], the dimension of input = "
"[%d]",
x_dims, x_dims.size()));
PADDLE_ENFORCE_LE(x_dims.size(), 5, platform::errors::InvalidArgument(
"ShapeError: the dimensions of input "
"must smaller than or equal to 5."
"But received: the shape of input "
"= [%s], the dimension of input = "
"[%d]",
x_dims, x_dims.size()));
const int64_t C = x_dims[x_dims.size() - 1];
auto scale_dim = ctx->GetInputDim("Scale");
auto bias_dim = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(
scale_dim.size(), 1UL,
platform::errors::InvalidArgument(
"ShapeError: the dimension of scale must equal to 1."
"But received: the shape of scale is [%s], the dimension "
"of scale is [%d]",
scale_dim, scale_dim.size()));
PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL,
platform::errors::InvalidArgument(
"ShapeError: the dimension of bias must equal to 1."
"But received: the shape of bias is [%s],the dimension "
"of bias is [%d]",
bias_dim, bias_dim.size()));
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
framework::product(bias_dim) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(scale_dim[0], C,
platform::errors::InvalidArgument(
"ShapeError: the shape of scale must equal to [%d]"
"But received: the shape of scale is [%d]",
C, scale_dim[0]));
PADDLE_ENFORCE_EQ(bias_dim[0], C,
platform::errors::InvalidArgument(
"ShapeError: the shape of bias must equal to [%d]"
"But received: the shape of bias is [%d]",
C, bias_dim[0]));
}
ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
ctx->SetOutputDim("SavedVariance", {C});
ctx->ShareLoD("X", "Y");
}
framework::OpKernelType FusedBatchNormAddActOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
// By default, the type of the scale, bias, mean,
// and var tensors should be float when input tensor's dtype is float16.
auto bn_param_type = framework::proto::VarType::FP32;
PADDLE_ENFORCE_EQ(
bn_param_type, ctx.Input<Tensor>("Scale")->type(),
platform::errors::InvalidArgument("Scale input should be of float type"));
PADDLE_ENFORCE_EQ(
bn_param_type, ctx.Input<Tensor>("Bias")->type(),
platform::errors::InvalidArgument("Bias input should be of float type"));
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library);
}
void FusedBatchNormAddActOpMaker::Make() {
AddInput("X", "The input tensor");
AddInput("Z", "The input tensor");
AddInput("Scale",
"Scale is a 1-dimensional tensor of size C "
"that is applied to the output");
AddInput("Bias",
"Bias is a 1-dimensional tensor of size C "
"that is applied to the output");
AddOutput("Y", "result after normalization");
AddOutput("MeanOut",
"Share memory with Mean. "
"Store the global mean when training");
AddOutput("VarianceOut",
"Share memory with Variance. "
"Store the global Variance when training");
AddOutput("SavedMean",
"Mean of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("SavedVariance",
"Variance of the current mini batch, "
"will apply to output when training")
.AsIntermediate();
AddOutput("ReserveSpace",
"Reserve GPU space for triggering the new semi-persistent "
"NHWC kernel");
AddAttr<float>("momentum", "").SetDefault(0.9);
AddAttr<float>("epsilon", "")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
platform::errors::InvalidArgument(
"'epsilon' should be between 0.0 and 0.001."));
});
AddAttr<std::string>("act_type", "The activation type to be fused.")
.SetDefault("relu");
AddComment(R"DOC(
Fused Batch Normalization with activation.
Batch Norm has been implemented as discussed in the paper:
https://arxiv.org/pdf/1502.03167.pdf
Batch Norm can be used as a normalizer function for conv2d and fully_connected operations.
Now, the required data format for FusedBatchNormAddActOp is NHWC `[batch, in_height, in_width, in_channels]`.
)DOC");
}
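// Conceptual summary (illustrative, not part of this patch): for NHWC input
// the fused op computes, per channel c,
//   y = act_type(scale_c * (x - mean_c) / sqrt(var_c + epsilon) + bias_c + z).
// A scalar sketch of the default relu case, with hypothetical per-channel
// statistics passed in (assumes <cmath> and <algorithm>):
//
//   float FusedBnAddRelu(float x, float z, float scale, float bias,
//                        float mean, float var, float eps) {
//     float bn = scale * (x - mean) / std::sqrt(var + eps) + bias;
//     return std::max(0.f, bn + z);  // add residual input, then activation
//   }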
void FusedBatchNormAddActGradOp::InferShape(
framework::InferShapeContext *ctx) const {
// check input
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X",
"FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z",
"FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale",
"FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean",
"FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasInput("SavedVariance"), "Input", "SavedVariance",
"FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input",
framework::GradVarName("Y"), "FusedBatchNormAddActGradOp");
// check output
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output",
framework::GradVarName("X"), "FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output",
framework::GradVarName("Z"), "FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Scale")), "Output",
framework::GradVarName("Scale"), "FusedBatchNormAddActGradOp");
OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Bias")), "Output",
framework::GradVarName("Bias"), "FusedBatchNormAddActGradOp");
const auto in_dims = ctx->GetInputDim("X");
const int C = in_dims[in_dims.size() - 1];
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
ctx->SetOutputDim(framework::GradVarName("Z"), in_dims);
ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
framework::OpKernelType FusedBatchNormAddActGradOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW(platform::errors::NotFound(
"Can not find Y@GRAD in the execution context."));
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW(
platform::errors::NotFound("Can not get the tensor value of Y@GRAD."));
}
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout,
library);
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(
fused_bn_add_activation, ops::FusedBatchNormAddActOp,
ops::FusedBatchNormAddActOpMaker, ops::FusedBatchNormAddActOpInferVarType,
ops::FusedBatchNormAddActGradOpMaker<paddle::framework::OpDesc>,
ops::FusedBatchNormAddActGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(fused_bn_add_activation_grad,
ops::FusedBatchNormAddActGradOp);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/norm_utils.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
DECLARE_bool(cudnn_batchnorm_spatial_persistent);
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
template <typename T>
using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
template <typename T>
class FusedBatchNormAddActKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
float momentum = ctx.Attr<float>("momentum");
std::string act_type = ctx.Attr<std::string>("act_type");
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
// Get the size for each dimension.
// NHWC [batch_size, in_height, in_width, in_channels]
const auto *x = ctx.Input<Tensor>("X");
const auto *z = ctx.Input<Tensor>("Z");
const auto &in_dims = x->dims();
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
auto *mean_out = ctx.Output<Tensor>("MeanOut");
auto *variance_out = ctx.Output<Tensor>("VarianceOut");
mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *saved_mean = ctx.Output<Tensor>("SavedMean");
auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
auto *y = ctx.Output<Tensor>("Y");
y->mutable_data<T>(ctx.GetPlace());
int N, C, H, W, D;
const DataLayout data_layout = DataLayout::kNHWC;
ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
// ------------------- cudnn descriptors ---------------------
auto handle = dev_ctx.cudnn_handle();
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
std::vector<int> dims = {N, C, H, W, D};
std::vector<int> strides = {H * W * D * C, 1, W * D * C, D * C, C};
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
double this_factor = 1. - momentum;
cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
platform::ScopedActivationDescriptor scope_act_desc;
cudnnActivationDescriptor_t activation_desc_ =
scope_act_desc.descriptor<T>(act_type);
size_t workspace_size = 0;
size_t reserve_space_size = 0;
void *reserve_space_ptr = nullptr;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
// Create reserve space and workspace for batch norm.
// Create a tensor for each batchnorm op; it will be used in the
// backward. Thus this tensor shouldn't be a temporary.
auto *reserve_space = ctx.Output<Tensor>("ReserveSpace");
PADDLE_ENFORCE_NOT_NULL(
reserve_space,
platform::errors::NotFound(
"The argument ReserveSpace of batch_norm op is not found."));
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::
cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/bnOps_,
/*xDesc=*/data_desc_,
/*zDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size));
// -------------- cudnn batchnorm reserve space --------------
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetBatchNormalizationTrainingExReserveSpaceSize(
/*handle=*/handle,
/*mode=*/mode_,
/*bnOps=*/bnOps_,
/*activationDesc=*/activation_desc_,
/*xDesc=*/data_desc_,
/*sizeInBytes=*/&reserve_space_size));
reserve_space_ptr = reserve_space->mutable_data(ctx.GetPlace(), x->type(),
reserve_space_size);
workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
workspace_size);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnBatchNormalizationForwardTrainingEx(
handle, mode_, bnOps_, CudnnDataType<T>::kOne(),
CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
data_desc_, z->template data<T>(), data_desc_,
y->template data<T>(), bn_param_desc_,
scale->template data<BatchNormParamType<T>>(),
bias->template data<BatchNormParamType<T>>(), this_factor,
mean_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
variance_out->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
epsilon, saved_mean->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
saved_variance->template mutable_data<BatchNormParamType<T>>(
ctx.GetPlace()),
activation_desc_, workspace_ptr, workspace_size, reserve_space_ptr,
reserve_space_size));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};
template <typename T>
class FusedBatchNormAddActGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
platform::errors::PreconditionNotMet("It must use CUDAPlace."));
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
std::string act_type = ctx.Attr<std::string>("act_type");
const auto *x = ctx.Input<Tensor>("X");
const auto *z = ctx.Input<Tensor>("Z");
const auto *y = ctx.Input<Tensor>("Y");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const auto *reserve_space = ctx.Input<Tensor>("ReserveSpace");
const auto &in_dims = x->dims();
int N, C, H, W, D;
const DataLayout data_layout = DataLayout::kNHWC;
ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_z = ctx.Output<Tensor>(framework::GradVarName("Z"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
d_z->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE_EQ(
d_scale && d_bias, true,
platform::errors::PreconditionNotMet(
"Both the scale grad and the bias grad must not be null."));
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL,
platform::errors::PreconditionNotMet(
"The scale only has one dimension."));
PADDLE_ENFORCE_EQ(
scale->dims()[0], C,
platform::errors::PreconditionNotMet(
"The size of scale is equal to the channel of Input(X)."));
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
std::vector<int> dims = {N, C, H, W, D};
std::vector<int> strides = {H * W * C * D, 1, W * D * C, D * C, C};
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
LOG(ERROR) << "Provided epsilon is smaller than "
<< "CUDNN_BN_MIN_EPSILON. Setting it to "
<< "CUDNN_BN_MIN_EPSILON instead.";
}
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
in_dims.size() > 3 ? in_dims.size() : 4, dims.data(), strides.data()));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_,
data_desc_, mode_));
const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
const auto *saved_mean_data =
saved_mean->template data<BatchNormParamType<T>>();
const auto *saved_var_data =
saved_var->template data<BatchNormParamType<T>>();
size_t workspace_size = 0;
void *workspace_ptr = nullptr;
Tensor workspace_tensor;
auto reserve_space_size = reserve_space->memory_size();
cudnnBatchNormOps_t bnOps_ = CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION;
platform::ScopedActivationDescriptor scope_act_desc;
cudnnActivationDescriptor_t activation_desc_ =
scope_act_desc.descriptor<T>(act_type);
// --------------- cudnn batchnorm workspace ---------------
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnGetBatchNormalizationBackwardExWorkspaceSize(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/bnOps_,
/*xDesc=*/data_desc_,
/*yDesc=*/data_desc_,
/*dyDesc=*/data_desc_,
/*dzDesc=*/data_desc_,
/*dxDesc=*/data_desc_,
/*bnScaleBiasMeanVarDesc=*/bn_param_desc_,
/*activationDesc=*/activation_desc_,
/*sizeInBytes=*/&workspace_size));
workspace_ptr = workspace_tensor.mutable_data(ctx.GetPlace(), x->type(),
workspace_size);
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnBatchNormalizationBackwardEx(
/*handle=*/dev_ctx.cudnn_handle(),
/*mode=*/mode_,
/*bnOps=*/bnOps_,
/*alphaDataDiff=*/CudnnDataType<T>::kOne(),
/*betaDataDiff=*/CudnnDataType<T>::kZero(),
/*alphaParamDiff=*/CudnnDataType<T>::kOne(),
/*betaParamDiff=*/CudnnDataType<T>::kZero(),
/*xDesc=*/data_desc_,
/*xData=*/x->template data<T>(),
/*yDesc=*/data_desc_,
/*yData=*/y->template data<T>(),
/*dyDesc=*/data_desc_,
/*dyData=*/d_y->template data<T>(),
/*dzDesc=*/data_desc_,
/*dzData=*/d_z->template data<T>(),
/*dxDesc=*/data_desc_,
/*dxData=*/d_x->template data<T>(),
/*dBnScaleBiasDesc=*/bn_param_desc_,
/*bnScaleData=*/scale->template data<BatchNormParamType<T>>(),
/*bnBiasData=*/bias->template data<BatchNormParamType<T>>(),
/*dBnScaleData=*/d_scale->template data<BatchNormParamType<T>>(),
/*dBnBiasData=*/d_bias->template data<BatchNormParamType<T>>(),
/*epsilon=*/epsilon,
/*savedMean=*/saved_mean_data,
/*savedInvVariance=*/saved_var_data,
/*activationDesc=*/activation_desc_,
/*workspace=*/workspace_ptr,
/*workSpaceSizeInBytes=*/workspace_size,
/*reserveSpace=*/const_cast<T *>(reserve_space->template data<T>()),
/*reserveSpaceSizeInBytes=*/reserve_space_size));
// clean when exit.
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
PADDLE_ENFORCE_CUDA_SUCCESS(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
}
};
} // namespace operators
} // namespace paddle
#if CUDNN_VERSION >= 7401
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
fused_bn_add_activation,
ops::FusedBatchNormAddActKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(fused_bn_add_activation_grad,
ops::FusedBatchNormAddActGradKernel<
plat::CUDADeviceContext, plat::float16>);
#endif
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/grad_op_desc_maker.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_type_inference.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class FusedBatchNormAddActOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusedBatchNormAddActGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override;
};
class FusedBatchNormAddActOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override;
};
template <typename T>
class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType(this->ForwardOpType() + "_grad");
op->SetInput("X", this->Input("X"));
op->SetInput("Z", this->Input("Z"));
op->SetInput("Y", this->Output("Y"));
op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
op->SetInput("Scale", this->Input("Scale"));
op->SetInput("Bias", this->Input("Bias"));
op->SetInput("SavedMean", this->Output("SavedMean"));
op->SetInput("SavedVariance", this->Output("SavedVariance"));
op->SetInput("ReserveSpace", this->Output("ReserveSpace"));
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z"));
op->SetOutput(framework::GradVarName("Scale"), this->InputGrad("Scale"));
op->SetOutput(framework::GradVarName("Bias"), this->InputGrad("Bias"));
}
};
class FusedBatchNormAddActOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string>& GetInputOutputWithSameType()
const override {
static std::unordered_map<std::string, std::string> m{{"X", /*->*/ "Y"}};
return m;
}
};
template <typename DeviceContext, typename T>
class FusedBatchNormAddActKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
template <typename DeviceContext, typename T>
class FusedBatchNormAddActGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include <cstring> // for memcpy
#include <string>
#include <vector>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
......
......@@ -192,6 +192,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
copy_size += src_mat_w_sz;
}
// fill data
if (context_start > 0) {
src_data += context_start * src_mat_w;
}
for (int j = 0; j < seq_len - up_pad - down_pad; ++j) {
std::memcpy(dst_data, src_data, copy_size);
dst_data += col_mat_w;
......@@ -201,18 +204,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
std::memset(dst_data, 0, down_pad * col_mat_w_sz);
copy_size -= src_mat_w_sz;
for (int j = 0; j < down_pad; ++j) {
if (copy_size < 0) {
copy_size = 0;
}
std::memcpy(dst_data, src_data, copy_size);
dst_data += col_mat_w;
src_data += src_mat_w;
copy_size -= src_mat_w_sz;
}
} else {
PADDLE_ENFORCE_GE(context_length, up_pad + down_pad + 1,
platform::errors::InvalidArgument(
"context length must be bigger or equal than "
"up_pad + down_pad + 1, but received context "
"length is: %d, up_pad is: %d, down_pad is: %d.",
context_length, up_pad, down_pad));
std::memset(dst_data, 0, seq_len * col_mat_w_sz);
dst_data = dst_data + up_pad * src_mat_w;
int zero_sz = up_pad * src_mat_w_sz;
......@@ -226,9 +226,15 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel<T> {
// from bottom
dst_data = col_data + ed * col_mat_w;
src_data = x_data + st * src_mat_w;
if (context_start > 0) {
src_data += context_start * src_mat_w;
}
zero_sz = down_pad * src_mat_w_sz;
for (int j = 1; j <= std::min(down_pad, seq_len); ++j) {
int copy_size = std::min(cur_src_sz, col_mat_w_sz - zero_sz);
if (copy_size < 0) {
copy_size = 0;
}
std::memcpy(dst_data - (zero_sz + copy_size) / sizeof(T),
src_data + std::max(seq_len - j - up_pad, 0) * src_mat_w,
copy_size);
......
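// Illustrative note (not part of this patch): copy_size is clamped to zero in
// the hunk above because std::memcpy takes a size_t; a negative int would wrap
// around to a huge unsigned value and read far out of bounds, e.g.
//
//   size_t SafeCopySize(int copy_size) {
//     return static_cast<size_t>(std::max(copy_size, 0));  // avoid wrap-around
//   }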
......@@ -24,20 +24,27 @@ void FusionSeqPoolCVMConcatOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GE(
ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FusionSeqPoolCVMConcatOp should not be null.");
paddle::platform::errors::InvalidArgument(
"Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty."));
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
paddle::platform::errors::InvalidArgument(
"Output(Out) of FusionSeqPoolCVMConcatOp should not be null."));
int axis = ctx->Attrs().Get<int>("axis");
PADDLE_ENFORCE_EQ(
axis, 1, "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet.");
axis, 1,
paddle::platform::errors::InvalidArgument(
"FusionSeqPoolCVMConcatOp only supports concat axis=1 yet."));
bool use_cvm = ctx->Attrs().Get<bool>("use_cvm");
PADDLE_ENFORCE_EQ(
use_cvm, true,
"FusionSeqPoolCVMConcatOp only supports use_cvm is true yet.");
paddle::platform::errors::InvalidArgument(
"FusionSeqPoolCVMConcatOp only supports use_cvm is true yet."));
auto ins_dims = ctx->GetInputsDim("X");
const size_t n = ins_dims.size();
PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0.");
PADDLE_ENFORCE_GT(n, 0UL, paddle::platform::errors::InvalidArgument(
"Input tensors count should > 0."));
if (n == 1) {
LOG(WARNING) << "Only have one input, may waste memory";
}
......@@ -45,7 +52,8 @@ void FusionSeqPoolCVMConcatOp::InferShape(
// The output height should be confirmed in Compute,
// since input lod is not accessible here.
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
"The dims size of first input should be 2.");
paddle::platform::errors::InvalidArgument(
"The dims size of first input should be 2."));
ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
}
......@@ -99,7 +107,8 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
int w = ins[0]->numel() / x0_dims[0];
PADDLE_ENFORCE_EQ(y_dims[1] % w, 0,
"The output of dims[1] should be dividable of w");
paddle::platform::errors::InvalidArgument(
"The output of dims[1] should be dividable of w"));
jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum);
if (pooltype == "AVERAGE") {
attr.type = jit::SeqPoolType::kAvg;
......@@ -117,9 +126,11 @@ class FusionSeqPoolCVMConcatKernel : public framework::OpKernel<T> {
const T* src = ins[i]->data<T>();
T* dst = y_data + i * w;
PADDLE_ENFORCE_EQ(static_cast<int>(ins[i]->numel() / x_dims[0]), w,
"Width of all inputs should be equal.");
paddle::platform::errors::InvalidArgument(
"Width of all inputs should be equal."));
PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1,
"Batchsize of all inputs should be equal.");
paddle::platform::errors::InvalidArgument(
"Batchsize of all inputs should be equal."));
for (size_t j = 0; j < bs; ++j) {
attr.h = static_cast<int>(x_lod[j + 1] - x_lod[j]);
seqpool(src, dst, &attr);
......
......@@ -47,7 +47,9 @@ class GRUUnitKernel : public framework::OpKernel<T> {
else if (act_type == relu)
ReluFunctor<T>()(d, x, y);
else
PADDLE_THROW("unsupported activation type");
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported activation type, only supports identity, sigmoid, tanh "
"and relu."));
}
void Compute(const framework::ExecutionContext& context) const override {
......@@ -137,7 +139,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
else if (act_type == relu)
ReluGradFunctor<T>()(d, x, y, dy, dx);
else
PADDLE_THROW("unsupported activation type");
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported activation type, only supports identity, sigmoid, tanh "
"and relu."));
}
void Compute(const framework::ExecutionContext& context) const override {
......
......@@ -595,9 +595,13 @@ class InstanceNormDoubleGradKernel<platform::CPUDeviceContext, T>
first_grad_arr +=
inv_var_tile_data *
(dy_arr - dy_arr.colwise().sum() / sample_size -
(dy_arr -
dy_arr.colwise().sum().replicate(sample_size, 1) / sample_size -
x_sub_mean_mul_invstd_arr *
(dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() /
(dy_arr * x_sub_mean_mul_invstd_arr)
.colwise()
.sum()
.replicate(sample_size, 1) /
sample_size);
first_grad_arr = first_grad_arr * ddx_arr;
for (int nc = 0; nc < NxC; ++nc) {
......
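// Illustrative note (not part of this patch): .colwise().sum() yields a
// 1 x C row vector, and .replicate(sample_size, 1) tiles it back to the
// sample_size x C shape of dy_arr so the subtraction is elementwise, e.g.
// (assuming Eigen):
//
//   Eigen::ArrayXXf a(2, 3);
//   a << 1, 2, 3, 4, 5, 6;
//   Eigen::ArrayXXf centered =
//       a - a.colwise().sum().replicate(2, 1) / 2.f;  // subtract column means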
......@@ -104,12 +104,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
auto dim_x = ctx->GetInputDim("X");
auto interp_method = ctx->Attrs().Get<std::string>("interp_method");
PADDLE_ENFORCE(
"bilinear" == interp_method || "nearest" == interp_method ||
"bicubic" == interp_method,
"Interpolation method can only be \"bilinear\" or \"nearest\" when "
"Input(X) dimension is 4, but got method = %s .",
interp_method);
PADDLE_ENFORCE_EQ("bilinear" == interp_method || "nearest" == interp_method ||
"bicubic" == interp_method,
true, platform::errors::InvalidArgument(
"Interpolation method can only be \"bilinear\" "
"or \"nearest\" or \"bicubic\" when "
"Input(X) dimension is 4, but got method is %s.",
interp_method));
const DataLayout data_layout = framework::StringToDataLayout(
ctx->Attrs().Get<std::string>("data_layout"));
......@@ -169,13 +170,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) {
auto out_size_dim = ctx->GetInputDim("OutSize");
PADDLE_ENFORCE_EQ(
out_size_dim.size(), 1,
platform::errors::InvalidArgument(
"OutSize's dimension size must be 1, but got dimension = %d .",
out_size_dim.size()));
platform::errors::InvalidArgument("OutSize's dimension size must be 1, "
"but got dimension size is %d .",
out_size_dim.size()));
PADDLE_ENFORCE_EQ(
out_size_dim[0], 2,
platform::errors::InvalidArgument(
"OutSize's dim[0] must be 2, but got dimention = %d .",
"OutSize's dimension[0] must be 2, but got dimension[0] is %d .",
out_size_dim[0]));
ctx->ShareLoD("X", "Out");
return;
......@@ -264,12 +265,15 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) {
if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
auto out_size_dim = ctx->GetInputDim("OutSize");
PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
"OutSize's dimension size must be 1, but got size =%d .",
out_size_dim.size());
PADDLE_ENFORCE_EQ(
out_size_dim.size(), 1,
platform::errors::InvalidArgument(
"OutSize's dimension size must be 1, but got size is %d.",
out_size_dim.size()));
PADDLE_ENFORCE_EQ(out_size_dim[0], 3,
"OutSize's dim[0] must be 3, but got size = %d .",
out_size_dim[0]);
platform::errors::InvalidArgument(
"OutSize's dim[0] must be 3, but got size is %d.",
out_size_dim[0]));
ctx->ShareLoD("X", "Out");
return;
}
......@@ -289,10 +293,8 @@ class InterpolateOp : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of InterpolateOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of InterpolationOp should not be null.");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Interpolate");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Interpolate");
auto dim_x = ctx->GetInputDim("X"); // NCHW format
PADDLE_ENFORCE(
......@@ -534,9 +536,10 @@ class InterpolateOpGrad : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "InterpolateGrad");
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "InterpolateGrad");
auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/linspace_op.h"
#include <string>
namespace paddle {
namespace operators {
......@@ -21,7 +22,7 @@ class LinspaceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace");
OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace");
OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace");
......@@ -50,11 +51,17 @@ class LinspaceOp : public framework::OperatorWithKernel {
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
return expected_kernel_type;
}
};
class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -23,9 +23,16 @@ namespace operators {
using Tensor = framework::Tensor;
template <typename T>
__global__ void LinspaceKernel(T start, double step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) {
out[index] = static_cast<T>(start + step * index);
__global__ void LinspaceKernel(T start, T stop, double step, int64_t size,
T* out) {
int64_t index = blockIdx.x * blockDim.x + threadIdx.x;
for (; index < size; index += blockDim.x * gridDim.x) {
if (index < size / 2) {
out[index] = static_cast<T>(start + step * index);
} else {
out[index] = static_cast<T>(stop - step * (size - index - 1));
}
}
}
......@@ -55,13 +62,15 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t);
framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t);
framework::Tensor n;
framework::TensorCopy(start_t, platform::CPUPlace(), &n);
T start = n.data<T>()[0];
framework::TensorCopy(stop_t, platform::CPUPlace(), &n);
T stop = n.data<T>()[0];
framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
int32_t num = n.data<int32_t>()[0];
framework::Tensor n_start;
framework::Tensor n_stop;
framework::Tensor n_num;
framework::TensorCopy(start_t, platform::CPUPlace(), &n_start);
T start = n_start.data<T>()[0];
framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop);
T stop = n_stop.data<T>()[0];
framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num);
int64_t num = static_cast<int64_t>(n_num.data<int32_t>()[0]);
PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
"The num of linspace op should be larger "
......@@ -72,14 +81,16 @@ class CUDALinspaceKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(context.GetPlace());
double step = 0;
if (num != 1) {
step = (static_cast<double>(stop - start)) / (num - 1);
}
auto stream = context.cuda_device_context().stream();
int block = 512;
int grid = (num + block - 1) / block;
LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data);
if (num != 1) {
step = (static_cast<double>(stop - start)) / (num - 1);
LinspaceKernel<T><<<grid, block, 0, stream>>>(start, stop, step, num,
out_data);
} else {
LinspaceSpecialKernel<T><<<grid, block, 0, stream>>>(start, out_data);
}
}
};
......
......@@ -56,9 +56,15 @@ class CPULinspaceKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(context.GetPlace());
if (num > 1) {
// step should be of double type for all types
double step = (static_cast<double>(stop - start)) / (num - 1);
int half_num = num / 2;
for (int i = 0; i < num; ++i) {
if (i < half_num) {
out_data[i] = static_cast<T>(start + step * i);
} else {
out_data[i] = static_cast<T>(stop - step * (num - i - 1));
}
}
} else {
out_data[0] = static_cast<T>(start);
......
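The linspace kernels above fill the first half of the output forward from `start` and the second half backward from `stop`, so the last element equals `stop` exactly even when `step` accumulates rounding error. A minimal host-side sketch of that two-sided fill (the helper name is illustrative, not part of Paddle):

#include <cstdint>
#include <vector>

// Reference two-sided fill: the first half is generated from start, the
// second half from stop, keeping both endpoints exact.
template <typename T>
std::vector<T> LinspaceReference(T start, T stop, int64_t num) {
  std::vector<T> out(num);
  if (num == 1) {
    out[0] = start;
    return out;
  }
  double step = static_cast<double>(stop - start) / (num - 1);
  for (int64_t i = 0; i < num; ++i) {
    if (i < num / 2) {
      out[i] = static_cast<T>(start + step * i);
    } else {
      out[i] = static_cast<T>(stop - step * (num - i - 1));
    }
  }
  return out;
}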
......@@ -76,6 +76,7 @@ math_library(prelu)
math_library(bert_encoder_functor)
math_library(tree2col DEPS math_function)
math_library(matrix_inverse)
math_library(segment_pooling)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/segment_pooling.h"
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, typename IndexT>
class SegmentPoolFunctor<platform::CPUDeviceContext, T, IndexT> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& segments, framework::Tensor* output,
framework::Tensor* index,
const std::string pooltype = "SUM") {
const IndexT* segment_ids = segments.data<IndexT>();
auto curent_id = segment_ids[0];
int64_t last_idx = 0;
int64_t w = input.numel() / input.dims()[0];
auto& place = *context.eigen_device();
for (int64_t idx = 1; idx <= segments.numel(); ++idx) {
if (idx < segments.numel()) {
if (segment_ids[idx] == curent_id) continue;
PADDLE_ENFORCE_GE(segment_ids[idx], curent_id,
platform::errors::InvalidArgument(
"The segment ids should be sorted, but got "
"segment_ids[%d]:%d > segment_ids[%d]:%d.",
idx - 1, curent_id, idx, segment_ids[idx]));
}
Tensor out_t = output->Slice(curent_id, curent_id + 1);
Tensor in_t = input.Slice(last_idx, idx);
int64_t h = idx - last_idx;
auto in_e =
framework::EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
auto out_e = framework::EigenVector<T>::Flatten(out_t);
auto reduce_dim = Eigen::array<int, 1>({{0}});
if (pooltype == "MEAN") {
out_e.device(place) = in_e.mean(reduce_dim);
} else if (pooltype == "SUM") {
out_e.device(place) = in_e.sum(reduce_dim);
} else if (pooltype == "MAX") {
out_e.device(place) = in_e.maximum(reduce_dim);
} else if (pooltype == "MIN") {
out_e.device(place) = in_e.minimum(reduce_dim);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported segment pooling type, only MEAN, SUM, MAX, MIN "
"available, but got %s.",
pooltype));
}
last_idx = idx;
if (idx < segments.numel()) curent_id = segment_ids[idx];
}
}
};
template <typename T, typename IndexT>
class SegmentPoolGradFunctor<platform::CPUDeviceContext, T, IndexT> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& out_grad,
const framework::Tensor& segments, framework::Tensor* in_grad,
const framework::Tensor* index = nullptr,
const std::string pooltype = "SUM") {
const IndexT* segment_ids = segments.data<IndexT>();
auto& place = *context.eigen_device();
auto curent_id = segment_ids[0];
int64_t last_idx = 0;
int64_t w = in_grad->numel() / in_grad->dims()[0];
for (int64_t idx = 1; idx <= segments.numel(); ++idx) {
if (idx < segments.numel()) {
if (segment_ids[idx] == curent_id) continue;
PADDLE_ENFORCE_GE(segment_ids[idx], curent_id,
platform::errors::InvalidArgument(
"The segment ids should be sorted, but got "
"segment_ids[%d]:%d > segment_ids[%d]:%d.",
idx - 1, curent_id, idx, segment_ids[idx]));
}
Tensor out_g_t = out_grad.Slice(curent_id, curent_id + 1);
Tensor in_g_t = in_grad->Slice(last_idx, idx);
int64_t h = idx - last_idx;
auto in_g_e = framework::EigenMatrix<T>::From(in_g_t, {h, w});
auto out_g_e = framework::EigenMatrix<T>::From(out_g_t, {1, w});
Eigen::DSizes<int, 2> bcast(h, 1);
if (pooltype == "MEAN") {
in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
} else if (pooltype == "SUM") {
in_g_e.device(place) = out_g_e.broadcast(bcast);
} else if (pooltype == "MAX" || pooltype == "MIN") {
Tensor out_t = output.Slice(curent_id, curent_id + 1);
Tensor in_t = input.Slice(last_idx, idx);
auto in_e = framework::EigenMatrix<T>::From(in_t, {h, w});
auto out_e = framework::EigenMatrix<T>::From(out_t, {1, w});
in_g_e.device(place) =
(in_e == out_e.broadcast(bcast)).template cast<T>() *
out_g_e.broadcast(bcast);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported segment pooling type, only MEAN, SUM, MAX, MIN "
"available, but got %s.",
pooltype));
}
last_idx = idx;
if (idx < segments.numel()) curent_id = segment_ids[idx];
}
}
};
using CPU = platform::CPUDeviceContext;
template class SegmentPoolFunctor<CPU, float, int>;
template class SegmentPoolFunctor<CPU, float, int64_t>;
template class SegmentPoolFunctor<CPU, double, int>;
template class SegmentPoolFunctor<CPU, double, int64_t>;
template class SegmentPoolGradFunctor<CPU, float, int>;
template class SegmentPoolGradFunctor<CPU, float, int64_t>;
template class SegmentPoolGradFunctor<CPU, double, int>;
template class SegmentPoolGradFunctor<CPU, double, int64_t>;
} // namespace operators
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T, typename IndexT>
class SegmentPoolFunctor {
public:
/* mean pool has summed_ids output */
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& segments, framework::Tensor* output,
framework::Tensor* summed_ids = nullptr,
const std::string pooltype = "SUM");
};
template <typename DeviceContext, typename T, typename IndexT>
class SegmentPoolGradFunctor {
public:
/* mean pool has summed_ids output */
void operator()(const DeviceContext& context, const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& out_grad,
const framework::Tensor& segments, framework::Tensor* in_grad,
const framework::Tensor* summed_ids = nullptr,
const std::string pooltype = "SUM");
};
} // namespace operators
} // namespace paddle
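SegmentPoolFunctor reduces consecutive rows that share a segment id into one output row. A small host-side sketch of the SUM and MEAN cases, assuming sorted, non-negative segment ids and one id per input row (the function name and container types are illustrative, not Paddle APIs):

#include <cstdint>
#include <stdexcept>
#include <string>
#include <vector>

// input: n rows of width w; segment_ids: one sorted id per row.
// Returns one pooled row per segment id in [0, segment_ids.back()].
std::vector<std::vector<double>> SegmentPoolReference(
    const std::vector<std::vector<double>>& input,
    const std::vector<int64_t>& segment_ids, const std::string& pooltype) {
  if (input.empty()) return {};
  const size_t w = input[0].size();
  const int64_t num_segments = segment_ids.back() + 1;
  std::vector<std::vector<double>> out(num_segments,
                                       std::vector<double>(w, 0.0));
  std::vector<int64_t> counts(num_segments, 0);
  for (size_t row = 0; row < input.size(); ++row) {
    if (row > 0 && segment_ids[row] < segment_ids[row - 1])
      throw std::invalid_argument("segment ids must be sorted");
    const int64_t id = segment_ids[row];
    ++counts[id];
    for (size_t col = 0; col < w; ++col) out[id][col] += input[row][col];
  }
  if (pooltype == "MEAN") {
    for (int64_t id = 0; id < num_segments; ++id)
      if (counts[id] > 0)
        for (size_t col = 0; col < w; ++col) out[id][col] /= counts[id];
  } else if (pooltype != "SUM") {
    throw std::invalid_argument("this sketch only covers SUM and MEAN");
  }
  return out;
}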
......@@ -44,8 +44,10 @@ class MergeLoDTensorOp : public framework::OperatorBase {
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto level = static_cast<size_t>(Attr<int>("level"));
PADDLE_ENFORCE_EQ(
in_true.numel() || in_false.numel(), true,
platform::errors::InvalidArgument(
"Input(InTrue) or Input(InFalse) should be initialized."));
auto &mask_dim = mask.dims();
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
......@@ -56,7 +58,9 @@ class MergeLoDTensorOp : public framework::OperatorBase {
framework::TensorCopy(mask, platform::CPUPlace(), dev_ctx,
cpu_mask.get());
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not supported GPU, Please recompile or reinstall paddle with CUDA "
"support."));
#endif
}
auto *mask_data = cpu_mask->data<bool>();
......@@ -109,7 +113,11 @@ class MergeLoDTensorOp : public framework::OperatorBase {
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
PADDLE_ENFORCE_GE(end_offset, start_offset,
platform::errors::InvalidArgument(
"The end offset is less than the start offset; end "
"offset is %d, start offset is %d.",
end_offset, start_offset));
size_t len = end_offset - start_offset;
if (len == 0) {
continue;
......@@ -189,22 +197,24 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase {
"merge_lod_tensor");
auto mask_dim = context->GetInputDim("Mask");
PADDLE_ENFORCE_EQ(mask_dim.size(), 2,
"If you are using IfElse OP:"
"\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
"ie.true_block():\n out_1 = ie.input(x)\n\n"
"Please ensure that the cond should be a 2-D tensor and "
"the second dim size of cond should be 1. "
"But now the cond's shape is [",
*mask_dim.Get(), "].\n");
platform::errors::InvalidArgument(
"If you are using IfElse OP:"
"\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
"ie.true_block():\n out_1 = ie.input(x)\n\n"
"Please ensure that the cond is a 2-D tensor and "
"the second dim size of cond is 1. "
"But now the cond's shape is [%s].\n",
mask_dim));
if (context->IsRuntime() || mask_dim[1] > 0) {
PADDLE_ENFORCE_EQ(mask_dim[1], 1,
"If you are using IfElse OP:"
"\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
"ie.true_block():\n out_1 = ie.input(x)\n\n"
"Please ensure that the cond should be a 2-D tensor "
"and the second dim size of cond should be 1. "
"But now the cond's shape is [",
*mask_dim.Get(), "].\n");
platform::errors::InvalidArgument(
"If you are using IfElse OP:"
"\n\nie = fluid.layers.IfElse(cond=cond)\nwith "
"ie.true_block():\n out_1 = ie.input(x)\n\n"
"Please ensure that the cond is a 2-D tensor "
"and the second dim size of cond is 1. "
"But now the cond's shape is [%s].\n",
mask_dim));
}
context->SetOutputDim("Out", context->GetInputDim("InTrue"));
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mv_op.h"
namespace paddle {
namespace operators {
class MVOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The matrix input of mv op");
AddInput("Vec", "The vector input of mv op");
AddOutput("Out", "The output of mv op");
AddComment(R"DOC(
MV Operator.
This operator performs matrix-vector multiplication
of the input tensors `X` and `Vec`.
)DOC");
}
};
class MVOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv");
OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv");
OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv");
auto dim_x = context->GetInputDim("X");
auto dim_y = context->GetInputDim("Vec");
PADDLE_ENFORCE_EQ(
dim_x.size(), 2,
platform::errors::InvalidArgument(
"The rank of input X should be 2, but is %d", dim_x.size()));
PADDLE_ENFORCE_EQ(
dim_y.size(), 1,
platform::errors::InvalidArgument(
"The rank of input Vec should be 1, but is %d", dim_y.size()));
PADDLE_ENFORCE_EQ(dim_x[1] == dim_y[0], true,
platform::errors::InvalidArgument(
"The length of input X' second dim should equal the "
"length of input Vec,"
" but X[%d, %d], Vec[%d]",
dim_x[0], dim_x[1], dim_y[0]));
framework::DDim dim_out = framework::make_ddim({dim_x[0]});
context->SetOutputDim("Out", dim_out);
context->ShareLoD("X", /*->*/ "Out");
}
};
template <typename T>
class MVOpGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> retv) const override {
retv->SetType("mv_grad");
retv->SetInput("X", this->Input("X"));
retv->SetInput("Vec", this->Input("Vec"));
retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
retv->SetOutput(framework::GradVarName("Vec"), this->InputGrad("Vec"));
}
};
class MVOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *context) const override {
OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv");
OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv");
OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
"Out@GRAD", "mv");
auto x_dims = context->GetInputDim("X");
auto vec_dims = context->GetInputDim("Vec");
auto x_grad_name = framework::GradVarName("X");
auto vec_grad_name = framework::GradVarName("Vec");
if (context->HasOutput(x_grad_name)) {
context->SetOutputDim(x_grad_name, x_dims);
}
if (context->HasOutput(vec_grad_name)) {
context->SetOutputDim(vec_grad_name, vec_dims);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker,
ops::MVOpGradMaker<paddle::framework::OpDesc>,
ops::MVOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(mv_grad, ops::MVOpGrad);
REGISTER_OP_CPU_KERNEL(
mv, ops::MVKernel<paddle::platform::CPUDeviceContext, float>,
ops::MVKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
mv_grad, ops::MVGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MVGradKernel<paddle::platform::CPUDeviceContext, double>);
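MVOp::InferShape above requires X to be an [m, n] matrix and Vec a length-n vector, giving Out shape [m]. A plain reference sketch of that product, for illustration only (it is not the kernel Paddle registers; names are hypothetical):

#include <cstddef>
#include <vector>

// Reference matrix-vector product: x is m x n in row-major order, vec has
// length n, and the result has length m, matching MVOp::InferShape.
std::vector<float> MvReference(const std::vector<float>& x,
                               const std::vector<float>& vec,
                               std::size_t m, std::size_t n) {
  std::vector<float> out(m, 0.0f);
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      out[i] += x[i * n + j] * vec[j];
    }
  }
  return out;
}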
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/mv_op.h"
#include "paddle/fluid/platform/gpu_launch_param_config.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void MVGradCUDAKernel(const int m, const int n, const T *dout,
const T *vec, T *dx) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < m * n; idx += blockDim.x * gridDim.x) {
int i = idx / n;
int j = idx % n;
dx[idx] = dout[i] * vec[j];
}
}
// Using the shape constraints of matrix-vector multiplication
// (X: [m, n], Vec: [n], Out: [m]), the gradients are:
//
// dX = dOut * Vec^T
// dVec = X^T * dOut
template <typename T>
class MVGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *x = context.Input<framework::Tensor>("X");
auto *vec = context.Input<framework::Tensor>("Vec");
auto *dout =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
auto *dx = context.Output<framework::Tensor>(framework::GradVarName("X"));
auto *dvec =
context.Output<framework::Tensor>(framework::GradVarName("Vec"));
auto dim_x = x->dims();
int m = dim_x[0];
int n = dim_x[1];
dx->Resize(framework::make_ddim({m * n}));
// get data ptr
const T *x_data = x->data<T>();
const T *vec_data = vec->data<T>();
const T *dout_data = dout->data<T>();
T *dx_data = dx->mutable_data<T>(context.GetPlace());
T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
// calculate dx
auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
MVGradCUDAKernel<
T><<<config.block_per_grid.x, config.thread_per_block.x, 0, stream>>>(
m, n, dout_data, vec_data, dx_data);
dx->Resize(framework::make_ddim({m, n}));
// calculate dvec
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
mv, ops::MVKernel<paddle::platform::CUDADeviceContext, float>,
ops::MVKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
mv_grad, ops::MVGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MVGradKernel<paddle::platform::CUDADeviceContext, double>);
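The gradient formulas noted in the CUDA file (dX = dOut Vec^T, dVec = X^T dOut) are what MVGradCUDAKernel and the GEMV call compute. A host-side sketch of those two gradients for a row-major X (names are illustrative, not Paddle APIs):

#include <cstddef>
#include <vector>

// dx[i][j] = dout[i] * vec[j]        (outer product, as in MVGradCUDAKernel)
// dvec[j]  = sum_i x[i][j] * dout[i] (X^T dOut, as in the GEMV call)
void MvGradReference(const std::vector<float>& x, const std::vector<float>& vec,
                     const std::vector<float>& dout, std::size_t m,
                     std::size_t n, std::vector<float>* dx,
                     std::vector<float>* dvec) {
  dx->assign(m * n, 0.0f);
  dvec->assign(n, 0.0f);
  for (std::size_t i = 0; i < m; ++i) {
    for (std::size_t j = 0; j < n; ++j) {
      (*dx)[i * n + j] = dout[i] * vec[j];
      (*dvec)[j] += x[i * n + j] * dout[i];
    }
  }
}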
......@@ -24,17 +24,19 @@ class DecayedAdagradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
......
......@@ -30,7 +30,12 @@ class LarsMomentumOpKernel : public framework::OpKernel<T> {
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
auto* grad_var = ctx.InputVar("Grad");
// only support dense for now.
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto grad = ctx.Input<framework::LoDTensor>("Grad");
param_out->mutable_data<T>(ctx.GetPlace());
......
......@@ -143,4 +143,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
REGISTER_OP_CPU_KERNEL(
rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, double>);