diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28dc39920c6d0748588168820f5043b2360e3ac9..250907a020cc6e91256588ccf6e840d4ac513984 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -284,6 +284,27 @@ if(WITH_GPU)
     endif()
 endif()
 
+if(WITH_ROCM)
+  include(hip)
+  include(miopen) # set miopen libraries, must be set before configure
+endif(WITH_ROCM)
+
+if (NOT WITH_ROCM AND WITH_RCCL)
+  MESSAGE(WARNING
+      "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
+  set(WITH_RCCL OFF CACHE STRING
+      "Disable RCCL when compiling without ROCM" FORCE)
+endif()
+
+if(WITH_RCCL)
+  add_definitions("-DPADDLE_WITH_RCCL")
+  include(rccl)
+else()
+  if(WITH_ROCM)
+    MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
+  endif()
+endif()
+
 include(third_party)  # download, build, install third_party, Contains about 20+ dependencies
 include(flags)        # set paddle compile flags
@@ -308,26 +329,6 @@ include(configure)    # add paddle env configuration
 
 include_directories("${PADDLE_SOURCE_DIR}")
 
-if(WITH_ROCM)
-  include(hip)
-endif(WITH_ROCM)
-
-if (NOT WITH_ROCM AND WITH_RCCL)
-  MESSAGE(WARNING
-      "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
-  set(WITH_RCCL OFF CACHE STRING
-      "Disable RCCL when compiling without ROCM" FORCE)
-endif()
-
-if(WITH_RCCL)
-  add_definitions("-DPADDLE_WITH_RCCL")
-  include(rccl)
-else()
-  if(WITH_ROCM)
-    MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
-  endif()
-endif()
-
 if(WITH_NV_JETSON)
   set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
 endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index e7f125269be1f5e015c6cf015489c312538ca4ba..458ab992c25f3818ae53b28fab38d9f986a36265 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -143,6 +143,14 @@ elseif(WITH_ROCM)
   add_definitions(-DPADDLE_WITH_HIP)
   add_definitions(-DEIGEN_USE_GPU)
   add_definitions(-DEIGEN_USE_HIP)
+
+  if(NOT MIOPEN_FOUND)
+    message(FATAL_ERROR "Paddle needs MIOpen to compile")
+  endif()
+
+  if(${MIOPEN_VERSION} VERSION_LESS 2090)
+    message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
+  endif()
 else()
   add_definitions(-DHPPL_STUB_FUNC)
   list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 033b40622e25943285a301ca5a219ac2667a376f..9bdfc36201d5396e881e8bd54e06b85b0e8d566e 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -95,11 +95,23 @@ function(select_nvcc_arch_flags out_variable)
   if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
     set(cuda_arch_bin "30 35")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    set(cuda_arch_bin "50")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "53")
+    else()
+      set(cuda_arch_bin "50")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    set(cuda_arch_bin "60 61")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "62")
+    else()
+      set(cuda_arch_bin "60 61")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
-    set(cuda_arch_bin "70")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "72")
+    else()
+      set(cuda_arch_bin "70")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index f14195480b7dc80df0566c9b09075797010fe289..d88d693d8286d1efab5242fb758331ef64663a4d 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -46,6 +46,7 @@
ExternalProject_Add( ${BOOST_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} "${BOOST_DOWNLOAD_CMD}" + URL_MD5 f891e8c2c9424f0565f0129ad9ab4aff PREFIX ${BOOST_PREFIX_DIR} DOWNLOAD_DIR ${BOOST_SOURCE_DIR} SOURCE_DIR ${BOOST_SOURCE_DIR} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index c37e28523f43c58c7cf5752ff1e1d26e9c3db4fd..ce5603b24b687daacea784c96fc00b828e513c97 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -110,7 +110,7 @@ if(WIN32) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) + COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64) else(WIN32) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 4cf9b626d15472206f47cd604d0b5b87089c4476..d99cb1952951c4c9e1ac7d7b1d727d28e8750ec6 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -24,6 +24,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") IF(WIN32) SET(MKLML_VER "mklml_win_2019.0.5.20190502" CACHE STRING "" FORCE) SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + SET(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) @@ -33,6 +34,7 @@ ELSE() # Now enable csrmm function in mklml library temporarily, it will be updated as offical version later. SET(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) @@ -52,6 +54,7 @@ ExternalProject_Add( ${MKLML_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} "${MKLML_DOWNLOAD_CMD}" + URL_MD5 ${MKLML_URL_MD5} PREFIX ${MKLML_PREFIX_DIR} DOWNLOAD_DIR ${MKLML_SOURCE_DIR} SOURCE_DIR ${MKLML_SOURCE_DIR} diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f5b85cc71a25f12285bb02648df55c3d88ec8e53 --- /dev/null +++ b/cmake/external/rocksdb.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +INCLUDE(ExternalProject) + +SET(ROCKSDB_SOURCES_DIR ${THIRD_PARTY_PATH}/rocksdb) +SET(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) +SET(ROCKSDB_INCLUDE_DIR "${ROCKSDB_INSTALL_DIR}/include" CACHE PATH "rocksdb include directory." FORCE) +SET(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." FORCE) +SET(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +INCLUDE_DIRECTORIES(${ROCKSDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_rocksdb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${ROCKSDB_SOURCES_DIR} + GIT_REPOSITORY "https://github.com/facebook/rocksdb" + GIT_TAG v6.10.1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DWITH_BZ2=OFF + -DWITH_GFLAGS=OFF + -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} +# BUILD_BYPRODUCTS ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a + INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ + && cp ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES} + && cp -r ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/include ${ROCKSDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_rocksdb snappy) + +ADD_LIBRARY(rocksdb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES}) +ADD_DEPENDENCIES(rocksdb extern_rocksdb) + +LIST(APPEND external_project_dependencies rocksdb) + diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index b0ea338d20525d8c40f2d4c1c92363c777c6ca67..6597e259aa890fde29ad49a37b1f450dfe6ad42d 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -24,7 +24,7 @@ SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) # in case of low internet speed #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) +set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 9694a7bc59c12a96e1c0c33488895ae94dbf2a03..8a18fa4a5512b39aa1bcd10bbd7589bdde600ab5 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -146,12 +146,12 @@ copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) -# Only GPU need cudaErrorMessage.pb +# GPU must copy externalErrorMsg.pb IF(WITH_GPU) - set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") - copy(inference_lib_dist - SRCS ${cudaerror_INCLUDE_DIR} - DSTS ${dst_dir}) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data") + copy(inference_lib_dist + SRCS ${externalError_INCLUDE_DIR} + DSTS ${dst_dir}) ENDIF() # CMakeCache Info @@ -193,10 +193,7 @@ copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/* DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h @@ -259,7 +256,7 
@@ copy(fluid_lib_dist set(module "platform") set(platform_lib_deps profiler_proto error_codes_proto) if(WITH_GPU) - set(platform_lib_deps ${platform_lib_deps} cuda_error_proto) + set(platform_lib_deps ${platform_lib_deps} external_error_proto) endif(WITH_GPU) add_dependencies(fluid_lib_dist ${platform_lib_deps}) @@ -323,12 +320,18 @@ function(version version_file) "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" - "WITH_GPU: ${WITH_GPU}\n") + "WITH_GPU: ${WITH_GPU}\n" + "WITH_ROCM: ${WITH_ROCM}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") endif() + if(WITH_ROCM) + file(APPEND ${version_file} + "HIP version: ${HIP_VERSION}\n" + "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f482f423dc5c12c5c0d7d87401c5d4a1d85a218a --- /dev/null +++ b/cmake/miopen.cmake @@ -0,0 +1,67 @@ +if(NOT WITH_ROCM) + return() +endif() + +# Now we don't support ROCm on windows +if(WIN32) + return() +endif() + +set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT") + +find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include + NO_DEFAULT_PATH +) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + +find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 + NO_DEFAULT_PATH + DOC "Path to MIOpen library.") + +if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY) + set(MIOPEN_FOUND ON) +else() + set(MIOPEN_FOUND OFF) +endif() + +macro(find_miopen_version miopen_header_file) + file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS) + get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1" + MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1" + MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1" + MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1" + MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}") + + if(NOT MIOPEN_MAJOR_VERSION) + set(MIOPEN_VERSION "???") + else() + add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"") + math(EXPR MIOPEN_VERSION + "${MIOPEN_MAJOR_VERSION} * 1000 + + ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}") + message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " + "Current MIOpen 
version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ")
+  endif()
+endmacro()
+
+if(MIOPEN_FOUND)
+  find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h)
+endif()
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index 56edaff2a50dab0f7029ec1e85fc3d4ce8ac416e..d33edef38ca7b36ce0b0474407ae7363884bfdaa 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -108,13 +108,19 @@ ENDMACRO()
 # 2. NAME: The name of the file, which determines the dirname
 #
 FUNCTION(file_download_and_uncompress URL NAME)
-  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
+  set(options "")
+  set(oneValueArgs MD5)
+  set(multiValueArgs "")
+  cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}")
   SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
   ExternalProject_Add(
-      extern_download_${NAME}
+      download_${NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${THIRD_PARTY_PATH}/${NAME}
       URL                   ${URL}
+      URL_MD5               ${URL_MD5}
+      TIMEOUT               120
       DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
       SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
       DOWNLOAD_NO_PROGRESS  1
@@ -123,7 +129,7 @@ FUNCTION(file_download_and_uncompress URL NAME)
       UPDATE_COMMAND        ""
       INSTALL_COMMAND       ""
   )
-  set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE)
+  set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
ENDFUNCTION()
@@ -242,8 +248,20 @@ if(WITH_GPU)
         include(external/cub)  # download cub
         list(APPEND third_party_deps extern_cub)
     endif()
-    set(CUDAERROR_URL  "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
-    file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
+    set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
+    file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz
+    if(WITH_TESTING)
+      # copy externalErrorMsg.pb for unittest 'enforce_test'
+      set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
+      if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
+        set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
+      else()
+        set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
+      endif()
+      add_custom_command(TARGET download_externalError POST_BUILD
+              COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR}
+              COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
+    endif()
 endif(WITH_GPU)
 
 if(WITH_XPU)
@@ -304,6 +322,11 @@ if (WITH_PSCORE)
     include(external/libmct)     # download, build, install libmct
     list(APPEND third_party_deps extern_libmct)
+
+    if (WITH_HETERPS)
+        include(external/rocksdb)     # download, build, install rocksdb
+        list(APPEND third_party_deps extern_rocksdb)
+    endif()
 endif()
 
 if(WITH_XBYAK)
diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc
index dfd55f16e1a065e46b2186a6a589eabc1ac3b431..9e2a0b35224a4ea3a6198e20309d3a335999651e 100644
--- a/paddle/fluid/distributed/fleet.cc
+++ b/paddle/fluid/distributed/fleet.cc
@@ -417,8 +417,10 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync(
   return;
 }
 
-void FleetWrapper::LoadModel(const std::string& path, const int mode) {
-  auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode));
+void FleetWrapper::LoadModel(const std::string& path, const std::string& mode) {
+  auto* communicator =
Communicator::GetInstance(); + auto ret = communicator->_worker_ptr->load(path, mode); + // auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode)); ret.wait(); if (ret.get() != 0) { LOG(ERROR) << "load model from path:" << path << " failed"; @@ -429,8 +431,11 @@ void FleetWrapper::LoadModel(const std::string& path, const int mode) { void FleetWrapper::LoadModelOneTable(const uint64_t table_id, const std::string& path, const int mode) { + auto* communicator = Communicator::GetInstance(); auto ret = - pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode)); + communicator->_worker_ptr->load(table_id, path, std::to_string(mode)); + // auto ret = + // pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode)); ret.wait(); if (ret.get() != 0) { LOG(ERROR) << "load model of table id: " << table_id diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 0da5d1e2bf987f38de3b9a03c659fc5e1841eca1..1b2bde85de04c2f0dc528700f10d087199c56c50 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -200,7 +200,7 @@ class FleetWrapper { void PrintTableStat(const uint64_t table_id); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff - void LoadModel(const std::string& path, const int mode); + void LoadModel(const std::string& path, const std::string& mode); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModelOneTable(const uint64_t table_id, const std::string& path, diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc index 2acc845a50890beb834676c3394f8dabc2a77e78..e949b21b02e6d9842ffae377a17610757a65ae75 100644 --- a/paddle/fluid/distributed/service/ps_local_client.cc +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -42,17 +42,17 @@ int32_t PsLocalClient::initialize() { ::std::future PsLocalClient::load(const std::string& epoch, const std::string& mode) { // TODO - // for (auto& it : _table_map) { - // load(it.first, epoch, mode); - //} + for (auto& it : _table_map) { + load(it.first, epoch, mode); + } return done(); } ::std::future PsLocalClient::load(uint32_t table_id, const std::string& epoch, const std::string& mode) { // TODO - // auto* table_ptr = table(table_id); - // table_ptr->load(epoch, mode); + auto* table_ptr = table(table_id); + table_ptr->load(epoch, mode); return done(); } @@ -245,7 +245,6 @@ int32_t PsLocalClient::initialize() { ::std::future PsLocalClient::push_sparse_raw_gradient( size_t table_id, const uint64_t* keys, const float** update_values, size_t num, void* callback) { - VLOG(1) << "wxx push_sparse_raw_gradient"; PSClientClosure* closure = reinterpret_cast(callback); auto* accessor = table_accessor(table_id); auto* table_ptr = table(table_id); diff --git a/paddle/fluid/distributed/service/ps_local_server.h b/paddle/fluid/distributed/service/ps_local_server.h index dfbccc70900e3cf10fbb0852a114e400d738e2d6..33b0b5fa796d7571e16a0f79fc6ce4de21b1e7a8 100644 --- a/paddle/fluid/distributed/service/ps_local_server.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -26,9 +26,14 @@ class PsLocalServer : public PSServer { PsLocalServer() {} virtual ~PsLocalServer() {} virtual uint64_t start() { return 0; } - virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; } virtual int32_t stop() { return 0; } virtual int32_t port() { 
return 0; } + virtual int32_t configure( + const PSParameter &config, PSEnvironment &env, size_t server_rank, + const std::vector &server_sub_program = {}) { + return 0; + } private: virtual int32_t initialize() { return 0; } diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/service/server.h index 74a8cbe44b144b75f33a9c392ffdc80148a82011..89b089386f501835b7c384477b84f98f94c2a4a9 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/service/server.h @@ -70,7 +70,7 @@ class PSServer { virtual int32_t configure( const PSParameter &config, PSEnvironment &env, size_t server_rank, - const std::vector &server_sub_program = {}) final; + const std::vector &server_sub_program = {}); // return server_ip virtual std::string ip() { return butil::my_ip_cstr(); } diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index dab390958034af284baaffcb909d8b941fc3b9d1..c928ebe90ceb9e6a6c2cd7983d112c9a6f9af6b3 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -9,15 +9,24 @@ set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS $ cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc -sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} -${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) +set(EXTERN_DEP "") +if(WITH_HETERPS) + set(TABLE_SRC common_sparse_table.cc ssd_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc) + set(EXTERN_DEP rocksdb) +else() + set(TABLE_SRC common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc) +endif() + +cc_library(common_table SRCS ${TABLE_SRC} DEPS ${TABLE_DEPS} +${RPC_DEPS} graph_edge graph_node device_context string_helper +simple_threadpool xxhash generator ${EXTERN_DEP}) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index b667aec186f9e343228d58c84e580554ed55a698..e1223face0f54ac782fa41ff16a2db1b08aa413a 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -25,83 +25,12 @@ class ValueBlock; } // namespace distributed } // namespace paddle -#define PSERVER_SAVE_SUFFIX ".shard" -using boost::lexical_cast; - namespace paddle { namespace distributed { -enum SaveMode { all, base, delta }; - -struct Meta { - std::string 
param; - int shard_id; - std::vector names; - std::vector dims; - uint64_t count; - std::unordered_map dims_map; - - explicit Meta(const std::string& metapath) { - std::ifstream file(metapath); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - if (StartWith(line, "#")) { - continue; - } - auto pairs = paddle::string::split_string(line, "="); - PADDLE_ENFORCE_EQ( - pairs.size(), 2, - paddle::platform::errors::InvalidArgument( - "info in %s except k=v, but got %s", metapath, line)); - - if (pairs[0] == "param") { - param = pairs[1]; - } - if (pairs[0] == "shard_id") { - shard_id = std::stoi(pairs[1]); - } - if (pairs[0] == "row_names") { - names = paddle::string::split_string(pairs[1], ","); - } - if (pairs[0] == "row_dims") { - auto dims_strs = - paddle::string::split_string(pairs[1], ","); - for (auto& str : dims_strs) { - dims.push_back(std::stoi(str)); - } - } - if (pairs[0] == "count") { - count = std::stoull(pairs[1]); - } - } - for (int x = 0; x < names.size(); ++x) { - dims_map[names[x]] = dims[x]; - } - } - - Meta(std::string param, int shard_id, std::vector row_names, - std::vector dims, uint64_t count) { - this->param = param; - this->shard_id = shard_id; - this->names = row_names; - this->dims = dims; - this->count = count; - } - - std::string ToString() { - std::stringstream ss; - ss << "param=" << param << "\n"; - ss << "shard_id=" << shard_id << "\n"; - ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n"; - ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n"; - ss << "count=" << count << "\n"; - return ss.str(); - } -}; - -void ProcessALine(const std::vector& columns, const Meta& meta, - const int64_t id, std::vector>* values) { +void CommonSparseTable::ProcessALine(const std::vector& columns, + const Meta& meta, const int64_t id, + std::vector>* values) { auto colunmn_size = columns.size(); auto load_values = paddle::string::split_string(columns[colunmn_size - 1], ","); @@ -134,8 +63,10 @@ void ProcessALine(const std::vector& columns, const Meta& meta, } } -void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, - const size_t shard_idx, const int64_t total) { +void CommonSparseTable::SaveMetaToText(std::ostream* os, + const CommonAccessorParameter& common, + const size_t shard_idx, + const int64_t total) { // save meta std::stringstream stream; stream << "param=" << common.table_name() << "\n"; @@ -148,8 +79,10 @@ void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, os->write(stream.str().c_str(), sizeof(char) * stream.str().size()); } -int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, - std::shared_ptr<::ThreadPool> pool, const int mode) { +int64_t CommonSparseTable::SaveValueToText(std::ostream* os, + std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, + const int mode, int shard_id) { int64_t save_num = 0; for (auto& table : block->values_) { for (auto& value : table) { @@ -186,10 +119,10 @@ int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, return save_num; } -int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, - const int pserver_id, const int pserver_num, - const int local_shard_num, - std::vector>* blocks) { +int64_t CommonSparseTable::LoadFromText( + const std::string& valuepath, const std::string& metapath, + const int pserver_id, const int pserver_num, const int local_shard_num, + std::vector>* blocks) { Meta meta = Meta(metapath); int num_lines = 0; @@ -198,7 +131,7 @@ int64_t 
LoadFromText(const std::string& valuepath, const std::string& metapath,
   while (std::getline(file, line)) {
     auto values = paddle::string::split_string<std::string>(line, "\t");
-    auto id = lexical_cast<int64_t>(values[0]);
+    auto id = lexical_cast<uint64_t>(values[0]);
 
     if (id % pserver_num != pserver_id) {
       VLOG(3) << "will not load " << values[0] << " from " << valuepath
@@ -388,8 +321,9 @@ int32_t CommonSparseTable::save(const std::string& dirname,
   int64_t total_ins = 0;
   for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) {
     // save values
-    auto shard_save_num = SaveValueToText(vs.get(), shard_values_[shard_id],
-                                          _shards_task_pool[shard_id], mode);
+    auto shard_save_num =
+        SaveValueToText(vs.get(), shard_values_[shard_id],
+                        _shards_task_pool[shard_id], mode, shard_id);
     total_ins += shard_save_num;
   }
   vs->close();
diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h
index 50c295da53464c8cc1589b27a6dbc233367991b4..ce3cc11686a4807e9de616e2de2dc1d9b1e7c3f9 100644
--- a/paddle/fluid/distributed/table/common_sparse_table.h
+++ b/paddle/fluid/distributed/table/common_sparse_table.h
@@ -32,11 +32,83 @@
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/string/string_helper.h"
 
+#define PSERVER_SAVE_SUFFIX ".shard"
+using boost::lexical_cast;
+
 namespace paddle {
 namespace distributed {
 
 class SparseOptimizer;
 
+enum SaveMode { all, base, delta };
+
+struct Meta {
+  std::string param;
+  int shard_id;
+  std::vector<std::string> names;
+  std::vector<int> dims;
+  uint64_t count;
+  std::unordered_map<std::string, int> dims_map;
+
+  explicit Meta(const std::string& metapath) {
+    std::ifstream file(metapath);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      if (StartWith(line, "#")) {
+        continue;
+      }
+      auto pairs = paddle::string::split_string<std::string>(line, "=");
+      PADDLE_ENFORCE_EQ(
+          pairs.size(), 2,
+          paddle::platform::errors::InvalidArgument(
+              "info in %s expect k=v, but got %s", metapath, line));
+
+      if (pairs[0] == "param") {
+        param = pairs[1];
+      }
+      if (pairs[0] == "shard_id") {
+        shard_id = std::stoi(pairs[1]);
+      }
+      if (pairs[0] == "row_names") {
+        names = paddle::string::split_string<std::string>(pairs[1], ",");
+      }
+      if (pairs[0] == "row_dims") {
+        auto dims_strs =
+            paddle::string::split_string<std::string>(pairs[1], ",");
+        for (auto& str : dims_strs) {
+          dims.push_back(std::stoi(str));
+        }
+      }
+      if (pairs[0] == "count") {
+        count = std::stoull(pairs[1]);
+      }
+    }
+    for (int x = 0; x < names.size(); ++x) {
+      dims_map[names[x]] = dims[x];
+    }
+  }
+
+  Meta(std::string param, int shard_id, std::vector<std::string> row_names,
+       std::vector<int> dims, uint64_t count) {
+    this->param = param;
+    this->shard_id = shard_id;
+    this->names = row_names;
+    this->dims = dims;
+    this->count = count;
+  }
+
+  std::string ToString() {
+    std::stringstream ss;
+    ss << "param=" << param << "\n";
+    ss << "shard_id=" << shard_id << "\n";
+    ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n";
+    ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n";
+    ss << "count=" << count << "\n";
+    return ss.str();
+  }
+};
+
 class CommonSparseTable : public SparseTable {
  public:
   CommonSparseTable() { rwlock_.reset(new framework::RWLock); }
@@ -56,9 +128,25 @@ class CommonSparseTable : public SparseTable {
   virtual int32_t initialize_optimizer();
   virtual int32_t initialize_recorder();
-  int32_t load(const std::string& path, const std::string& param);
+  virtual int32_t load(const std::string& path, const std::string& param);
+
+  virtual int32_t save(const std::string& path, const std::string&
param); + + void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, + const size_t shard_idx, const int64_t total); - int32_t save(const std::string& path, const std::string& param); + int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, const int mode, + int shard_id); + + virtual void ProcessALine(const std::vector& columns, + const Meta& meta, const int64_t id, + std::vector>* values); + + virtual int64_t LoadFromText( + const std::string& valuepath, const std::string& metapath, + const int pserver_id, const int pserver_num, const int local_shard_num, + std::vector>* blocks); virtual std::pair print_table_stat(); virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); @@ -89,7 +177,7 @@ class CommonSparseTable : public SparseTable { virtual int32_t _push_sparse(const uint64_t* keys, const float** values, size_t num); - private: + protected: const int task_pool_size_ = 11; std::vector> _shards_task_pool; diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index 5c10fca98cda4d6cbdcb430ab5f2b8016a6ff7f2..ac11183d192fffcec80dc1d4a586cda95751c6cd 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -83,6 +83,7 @@ inline bool probility_entry(VALUE *value, float threshold) { class ValueBlock { public: + typedef typename robin_hood::unordered_map map_type; explicit ValueBlock(const std::vector &value_names, const std::vector &value_dims, const std::vector &value_offsets, @@ -261,6 +262,18 @@ class ValueBlock { value->is_entry_ = state; } + void erase(uint64_t feasign) { + size_t hash = _hasher(feasign); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto iter = table.find(feasign); + if (iter != table.end()) { + butil::return_object(iter->second); + iter = table.erase(iter); + } + } + void Shrink(const int threshold) { for (auto &table : values_) { for (auto iter = table.begin(); iter != table.end();) { @@ -289,6 +302,23 @@ class ValueBlock { } } + map_type::iterator end() { + return values_[SPARSE_SHARD_BUCKET_NUM - 1].end(); + } + + map_type::iterator Find(uint64_t id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return end(); + } else { + return got; + } + } + private: bool Has(const uint64_t id) { size_t hash = _hasher(id); @@ -304,7 +334,7 @@ class ValueBlock { } public: - robin_hood::unordered_map values_[SPARSE_SHARD_BUCKET_NUM]; + map_type values_[SPARSE_SHARD_BUCKET_NUM]; size_t value_length_ = 0; std::hash _hasher; diff --git a/paddle/fluid/distributed/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/table/depends/rocksdb_warpper.h new file mode 100644 index 0000000000000000000000000000000000000000..0e25a89cb14d7293045cde871ad2ae0ce1cb5d66 --- /dev/null +++ b/paddle/fluid/distributed/table/depends/rocksdb_warpper.h @@ -0,0 +1,158 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
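Note: the ValueBlock changes above add a bucket-sharded key/value map: Find() hashes a feasign into one of SPARSE_SHARD_BUCKET_NUM sub-maps and maps "not found in this bucket" onto the end() iterator of the last bucket, so callers can compare against a single sentinel, while erase() recycles the VALUE object via butil::return_object before removing the entry. A minimal standalone sketch of that pattern, assuming std::unordered_map in place of robin_hood::unordered_map and a null pointer in place of the sentinel iterator (ShardedMap, kBucketNum and the other names here are illustrative, not Paddle API):

#include <cstdint>
#include <functional>
#include <unordered_map>

constexpr size_t kBucketNum = 16;  // stand-in for SPARSE_SHARD_BUCKET_NUM

struct ShardedMap {
  std::unordered_map<uint64_t, float*> buckets_[kBucketNum];
  std::hash<uint64_t> hasher_;

  // Hash the key once, then address only the owning bucket.
  std::unordered_map<uint64_t, float*>& BucketFor(uint64_t id) {
    return buckets_[hasher_(id) % kBucketNum];
  }

  // ValueBlock returns values_[SPARSE_SHARD_BUCKET_NUM - 1].end() as a
  // shared "not found" sentinel; a null pointer plays that role here.
  float** Find(uint64_t id) {
    auto& table = BucketFor(id);
    auto got = table.find(id);
    return got == table.end() ? nullptr : &got->second;
  }

  void Erase(uint64_t id) {
    auto& table = BucketFor(id);
    auto iter = table.find(id);
    if (iter != table.end()) {
      // The real erase() first recycles the VALUE* via butil::return_object.
      table.erase(iter);
    }
  }
};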
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_HETERPS +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +class RocksDBHandler { + public: + RocksDBHandler() {} + ~RocksDBHandler() {} + + static RocksDBHandler* GetInstance() { + static RocksDBHandler handler; + return &handler; + } + + int initialize(const std::string& db_path, const int colnum) { + VLOG(3) << "db path: " << db_path << " colnum: " << colnum; + rocksdb::Options options; + rocksdb::BlockBasedTableOptions bbto; + bbto.block_size = 4 * 1024; + bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + + options.keep_log_file_num = 100; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + options.create_if_missing = true; + options.use_direct_reads = true; + options.max_background_flushes = 5; + options.max_background_compactions = 5; + options.base_background_compactions = 10; + options.write_buffer_size = 256 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 8; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + options.level0_file_num_compaction_trigger = 8; + options.level0_slowdown_writes_trigger = + 1.8 * options.level0_file_num_compaction_trigger; + options.level0_stop_writes_trigger = + 3.6 * options.level0_file_num_compaction_trigger; + + if (!db_path.empty()) { + std::string rm_cmd = "rm -rf " + db_path; + system(rm_cmd.c_str()); + } + + rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db); + assert(s.ok()); + _handles.resize(colnum); + for (int i = 0; i < colnum; i++) { + s = _db->CreateColumnFamily(options, "shard_" + std::to_string(i), + &_handles[i]); + assert(s.ok()); + } + LOG(INFO) << "DB initialize success, colnum:" << colnum; + return 0; + } + + int put(int id, const char* key, int key_len, const char* value, + int value_len) { + rocksdb::WriteOptions options; + options.disableWAL = true; + rocksdb::Status s = + _db->Put(options, _handles[id], rocksdb::Slice(key, key_len), + rocksdb::Slice(value, value_len)); + assert(s.ok()); + return 0; + } + + int put_batch(int id, std::vector>& ssd_keys, + std::vector>& ssd_values, int n) { + rocksdb::WriteOptions options; + options.disableWAL = true; + rocksdb::WriteBatch batch(n * 128); + for (int i = 0; i < n; i++) { + batch.Put(_handles[id], + rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), + rocksdb::Slice(ssd_values[i].first, ssd_values[i].second)); + } + rocksdb::Status s = _db->Write(options, &batch); + assert(s.ok()); + return 0; + } + + int 
get(int id, const char* key, int key_len, std::string& value) { + rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), _handles[id], + rocksdb::Slice(key, key_len), &value); + if (s.IsNotFound()) { + return 1; + } + assert(s.ok()); + return 0; + } + + int del_data(int id, const char* key, int key_len) { + rocksdb::WriteOptions options; + options.disableWAL = true; + rocksdb::Status s = + _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len)); + assert(s.ok()); + return 0; + } + + int flush(int id) { + rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]); + assert(s.ok()); + return 0; + } + + rocksdb::Iterator* get_iterator(int id) { + return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]); + } + + int get_estimate_key_num(uint64_t& num_keys) { + _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return 0; + } + + private: + std::vector _handles; + rocksdb::DB* _db; +}; +} +} +#endif diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.cc b/paddle/fluid/distributed/table/ssd_sparse_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..5de6de3d2909d670c4bfdabdac37e72fcb125d5e --- /dev/null +++ b/paddle/fluid/distributed/table/ssd_sparse_table.cc @@ -0,0 +1,362 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
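Note: RocksDBHandler above exposes a small byte-slice API (initialize, put, put_batch, get, del_data, flush, get_iterator), keeping one RocksDB column family per shard id, with get returning 1 for a missing key and 0 otherwise. A hedged usage sketch under those signatures; the path and value sizes are illustrative, and the header is only compiled under PADDLE_WITH_HETERPS:

#include <cstdint>
#include <string>

void ExampleRocksDBUsage() {
  auto* db = paddle::distributed::RocksDBHandler::GetInstance();
  // initialize() wipes db_path and creates one column family per shard.
  db->initialize("/tmp/ssd_table_db", /*colnum=*/11);

  uint64_t feasign = 12345;
  int shard = static_cast<int>(feasign % 11);
  float values[8] = {0.f};

  // Keys and values travel as raw byte slices.
  db->put(shard, reinterpret_cast<const char*>(&feasign), sizeof(uint64_t),
          reinterpret_cast<const char*>(values), sizeof(values));

  std::string out;
  if (db->get(shard, reinterpret_cast<const char*>(&feasign),
              sizeof(uint64_t), out) == 0) {
    // Found: out holds the stored bytes, reinterpretable as float[8].
  }
  db->flush(shard);
}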
+
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+
+DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file");
+
+namespace paddle {
+namespace distributed {
+
+int32_t SSDSparseTable::initialize() {
+  _shards_task_pool.resize(task_pool_size_);
+  for (int i = 0; i < _shards_task_pool.size(); ++i) {
+    _shards_task_pool[i].reset(new ::ThreadPool(1));
+  }
+
+  sync = _config.common().sync();
+  VLOG(1) << "table " << _config.common().table_name() << " is sync: " << sync;
+
+  _global_lr = new float(1.0);
+
+  auto common = _config.common();
+  int size = static_cast<int>(common.params().size());
+
+  size_t offset = 0;
+  for (int x = 0; x < size; ++x) {
+    auto& varname = common.params()[x];
+    auto& dim = common.dims()[x];
+
+    value_idx_[varname] = x;
+    value_names_.push_back(varname);
+    value_dims_.push_back(dim);
+    value_offsets_.push_back(offset);
+    initializer_attrs_.push_back(common.initializers()[x]);
+
+    if (varname == "Param") {
+      param_dim_ = dim;
+      param_offset_ = offset;
+    }
+
+    offset += dim;
+  }
+
+  initialize_value();
+  initialize_optimizer();
+  initialize_recorder();
+  _db = paddle::distributed::RocksDBHandler::GetInstance();
+  _db->initialize(FLAGS_rocksdb_path, task_pool_size_);
+  return 0;
+}
+
+int32_t SSDSparseTable::pull_sparse(float* pull_values,
+                                    const PullSparseValue& pull_value) {
+  auto shard_num = task_pool_size_;
+  std::vector<std::future<int>> tasks(shard_num);
+
+  for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
+    tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
+        [this, shard_id, shard_num, &pull_value, &pull_values]() -> int {
+          auto& block = shard_values_[shard_id];
+
+          std::vector<int> offsets;
+          pull_value.Fission(shard_id, shard_num, &offsets);
+
+          for (auto& offset : offsets) {
+            auto feasign = pull_value.feasigns_[offset];
+            auto frequencie = pull_value.frequencies_[offset];
+            float* embedding = nullptr;
+            auto iter = block->Find(feasign);
+            // in mem
+            if (iter != block->end()) {
+              embedding = iter->second->data_.data();
+              if (pull_value.is_training_) {
+                block->AttrUpdate(iter->second, frequencie);
+              }
+            } else {
+              // need create
+              std::string tmp_str("");
+              if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t),
+                           tmp_str) > 0) {
+                embedding = block->Init(feasign, true, frequencie);
+              } else {
+                // in db
+                int data_size = tmp_str.size() / sizeof(float);
+                int value_size = block->value_length_;
+                float* db_value = (float*)const_cast<char*>(tmp_str.c_str());
+                VALUE* value = block->InitGet(feasign);
+
+                // copy to mem
+                memcpy(value->data_.data(), db_value,
+                       value_size * sizeof(float));
+                embedding = db_value;
+
+                // param, count, unseen_day
+                value->count_ = db_value[value_size];
+                value->unseen_days_ = db_value[value_size + 1];
+                value->is_entry_ = db_value[value_size + 2];
+                if (pull_value.is_training_) {
+                  block->AttrUpdate(value, frequencie);
+                }
+              }
+            }
+            std::copy_n(embedding + param_offset_, param_dim_,
+                        pull_values + param_dim_ * offset);
+          }
+          return 0;
+        });
+  }
+
+  for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
+    tasks[shard_id].wait();
+  }
+  return 0;
+}
+
+int32_t SSDSparseTable::pull_sparse_ptr(char** pull_values,
+                                        const uint64_t* keys, size_t num) {
+  auto shard_num = task_pool_size_;
+  std::vector<std::future<int>> tasks(shard_num);
+
+  std::vector<std::vector<int>> offset_bucket;
+  offset_bucket.resize(task_pool_size_);
+
+  for (int x = 0; x < num; ++x) {
+    auto y = keys[x] % task_pool_size_;
+    offset_bucket[y].push_back(x);
+  }
+
+  for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
+    tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
+        [this, shard_id, &keys, &pull_values, &offset_bucket]() -> int {
+          auto& block = shard_values_[shard_id];
+          auto& offsets = offset_bucket[shard_id];
+
+          for (auto& offset : offsets) {
+            auto feasign = keys[offset];
+            auto iter = block->Find(feasign);
+            VALUE* value = nullptr;
+            // in mem
+            if (iter != block->end()) {
+              value = iter->second;
+            } else {
+              // need create
+              std::string tmp_str("");
+              if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t),
+                           tmp_str) > 0) {
+                value = block->InitGet(feasign);
+              } else {
+                // in db
+                int data_size = tmp_str.size() / sizeof(float);
+                int value_size = block->value_length_;
+                float* db_value = (float*)const_cast<char*>(tmp_str.c_str());
+                value = block->InitGet(feasign);
+
+                // copy to mem
+                memcpy(value->data_.data(), db_value,
+                       value_size * sizeof(float));
+
+                // param, count, unseen_day
+                value->count_ = db_value[value_size];
+                value->unseen_days_ = db_value[value_size + 1];
+                value->is_entry_ = db_value[value_size + 2];
+              }
+            }
+            pull_values[offset] = (char*)value;
+          }
+          return 0;
+        });
+  }
+
+  for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
+    tasks[shard_id].wait();
+  }
+  return 0;
+}
+
+int32_t SSDSparseTable::shrink(const std::string& param) { return 0; }
+
+int32_t SSDSparseTable::update_table() {
+  int count = 0;
+  int value_size = shard_values_[0]->value_length_;
+  int db_size = 3 + value_size;
+  float tmp_value[db_size];
+
+  for (size_t i = 0; i < task_pool_size_; ++i) {
+    auto& block = shard_values_[i];
+
+    for (auto& table : block->values_) {
+      for (auto iter = table.begin(); iter != table.end();) {
+        VALUE* value = iter->second;
+        if (value->unseen_days_ >= 1) {
+          tmp_value[value_size] = value->count_;
+          tmp_value[value_size + 1] = value->unseen_days_;
+          tmp_value[value_size + 2] = value->is_entry_;
+          memcpy(tmp_value, value->data_.data(), sizeof(float) * value_size);
+          _db->put(i, (char*)&(iter->first), sizeof(uint64_t),
+                   (char*)tmp_value, db_size * sizeof(float));
+          count++;
+
+          butil::return_object(iter->second);
+          iter = table.erase(iter);
+        } else {
+          ++iter;
+        }
+      }
+    }
+    _db->flush(i);
+  }
+  VLOG(1) << "Table>> update count: " << count;
+  return 0;
+}
+
+int64_t SSDSparseTable::SaveValueToText(std::ostream* os,
+                                        std::shared_ptr<ValueBlock> block,
+                                        std::shared_ptr<::ThreadPool> pool,
+                                        const int mode, int shard_id) {
+  int64_t save_num = 0;
+
+  for (auto& table : block->values_) {
+    for (auto& value : table) {
+      if (mode == SaveMode::delta && !value.second->need_save_) {
+        continue;
+      }
+
+      ++save_num;
+
+      std::stringstream ss;
+      auto* vs = value.second->data_.data();
+
+      auto id = value.first;
+
+      ss << id << "\t" << value.second->count_ << "\t"
+         << value.second->unseen_days_ << "\t" << value.second->is_entry_
+         << "\t";
+
+      for (int i = 0; i < block->value_length_ - 1; i++) {
+        ss << std::to_string(vs[i]) << ",";
+      }
+
+      ss << std::to_string(vs[block->value_length_ - 1]);
+      ss << "\n";
+
+      os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
+
+      if (mode == SaveMode::base || mode == SaveMode::delta) {
+        value.second->need_save_ = false;
+      }
+    }
+  }
+
+  if (mode != 1) {
+    int value_size = block->value_length_;
+    auto* it = _db->get_iterator(shard_id);
+
+    for (it->SeekToFirst(); it->Valid(); it->Next()) {
+      float* value = (float*)const_cast<char*>(it->value().data());
+      std::stringstream ss;
+      ss << *((uint64_t*)const_cast<char*>(it->key().data())) << "\t"
+         << value[value_size] << "\t" << value[value_size + 1] << "\t"
+         << value[value_size + 2] << "\t";
+      for (int i = 0; i < block->value_length_ - 1; i++) {
+        ss << std::to_string(value[i]) << ",";
+      }
+
+      ss << std::to_string(value[block->value_length_ - 1]);
+      ss << "\n";
+
+      os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
+    }
+  }
+
+  return save_num;
+}
+
+int32_t SSDSparseTable::load(const std::string& path,
+                             const std::string& param) {
+  rwlock_->WRLock();
+  VLOG(3) << "ssd sparse table load with " << path << " with meta " << param;
+  LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_,
+               &shard_values_);
+  rwlock_->UNLock();
+  return 0;
+}
+
+int64_t SSDSparseTable::LoadFromText(
+    const std::string& valuepath, const std::string& metapath,
+    const int pserver_id, const int pserver_num, const int local_shard_num,
+    std::vector<std::shared_ptr<ValueBlock>>* blocks) {
+  Meta meta = Meta(metapath);
+
+  int num_lines = 0;
+  std::ifstream file(valuepath);
+  std::string line;
+
+  int value_size = shard_values_[0]->value_length_;
+  int db_size = 3 + value_size;
+  float tmp_value[db_size];
+
+  while (std::getline(file, line)) {
+    auto values = paddle::string::split_string<std::string>(line, "\t");
+    auto id = lexical_cast<uint64_t>(values[0]);
+
+    if (id % pserver_num != pserver_id) {
+      VLOG(3) << "will not load " << values[0] << " from " << valuepath
+              << ", please check id distribution";
+      continue;
+    }
+
+    auto shard_id = id % local_shard_num;
+    auto block = blocks->at(shard_id);
+
+    std::vector<std::vector<float>> kvalues;
+    ProcessALine(values, meta, id, &kvalues);
+
+    block->Init(id, false);
+
+    VALUE* value_instant = block->GetValue(id);
+
+    if (values.size() == 5) {
+      value_instant->count_ = lexical_cast<int>(values[1]);
+      value_instant->unseen_days_ = lexical_cast<int>(values[2]);
+      value_instant->is_entry_ =
+          static_cast<bool>(lexical_cast<int>(values[3]));
+    }
+
+    std::vector<float*> block_values = block->Get(id, meta.names, meta.dims);
+    auto blas = GetBlas<float>();
+    for (int x = 0; x < meta.names.size(); ++x) {
+      blas.VCOPY(meta.dims[x], kvalues[x].data(), block_values[x]);
+    }
+    VLOG(3) << "loading: " << id
+            << " unseen day: " << value_instant->unseen_days_;
+    if (value_instant->unseen_days_ >= 1) {
+      tmp_value[value_size] = value_instant->count_;
+      tmp_value[value_size + 1] = value_instant->unseen_days_;
+      tmp_value[value_size + 2] = value_instant->is_entry_;
+      memcpy(tmp_value, value_instant->data_.data(),
+             sizeof(float) * value_size);
+      _db->put(shard_id, (char*)&(id), sizeof(uint64_t), (char*)tmp_value,
+               db_size * sizeof(float));
+      block->erase(id);
+    }
+  }
+
+  return 0;
+}
+
+}  // namespace distributed
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.h b/paddle/fluid/distributed/table/ssd_sparse_table.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e85fa3ce59d13c1f996f00a4b5b7dd9114ed764
--- /dev/null
+++ b/paddle/fluid/distributed/table/ssd_sparse_table.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
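Note: update_table() and LoadFromText() above both serialize an evicted entry as one flat float array of db_size = value_size + 3 elements: the embedding payload first, then count_, unseen_days_, and is_entry_ encoded as floats. A small sketch of that packing, with illustrative names that are not part of the table's API:

#include <cstring>
#include <vector>

// Pack [data..., count, unseen_days, is_entry] exactly as the table writes
// the record to RocksDB; the record holds data.size() + 3 floats.
std::vector<float> PackRecord(const std::vector<float>& data, float count,
                              float unseen_days, float is_entry) {
  std::vector<float> record(data.size() + 3);
  std::memcpy(record.data(), data.data(), sizeof(float) * data.size());
  record[data.size()] = count;
  record[data.size() + 1] = unseen_days;
  record[data.size() + 2] = is_entry;
  return record;
}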
+
+#pragma once
+#include "paddle/fluid/distributed/table/common_sparse_table.h"
+#include "paddle/fluid/distributed/table/depends/rocksdb_warpper.h"
+#ifdef PADDLE_WITH_HETERPS
+namespace paddle {
+namespace distributed {
+class SSDSparseTable : public CommonSparseTable {
+ public:
+  SSDSparseTable() {}
+  virtual ~SSDSparseTable() {}
+
+  virtual int32_t initialize() override;
+
+  void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
+                      const size_t shard_idx, const int64_t total);
+
+  int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
+                          std::shared_ptr<::ThreadPool> pool, const int mode,
+                          int shard_id);
+
+  virtual int64_t LoadFromText(
+      const std::string& valuepath, const std::string& metapath,
+      const int pserver_id, const int pserver_num, const int local_shard_num,
+      std::vector<std::shared_ptr<ValueBlock>>* blocks);
+
+  virtual int32_t load(const std::string& path, const std::string& param);
+
+  // exchange data
+  virtual int32_t update_table();
+
+  virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
+
+  virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys,
+                                  size_t num);
+
+  virtual int32_t flush() override { return 0; }
+  virtual int32_t shrink(const std::string& param) override;
+  virtual void clear() override {}
+
+ private:
+  RocksDBHandler* _db;
+  int64_t _cache_tk_size;
+};
+
+}  // namespace distributed
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc
index 600be954cb59663fff6f867c020248a92e81a151..0f8753c074634189ffd39350425e6c1936569631 100644
--- a/paddle/fluid/distributed/table/table.cc
+++ b/paddle/fluid/distributed/table/table.cc
@@ -21,6 +21,9 @@
 #include "paddle/fluid/distributed/table/common_graph_table.h"
 #include "paddle/fluid/distributed/table/common_sparse_table.h"
 #include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#endif
 #include "paddle/fluid/distributed/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/table/tensor_table.h"
 
@@ -29,6 +32,9 @@ namespace distributed {
 REGISTER_PSCORE_CLASS(Table, GraphTable);
 REGISTER_PSCORE_CLASS(Table, CommonDenseTable);
 REGISTER_PSCORE_CLASS(Table, CommonSparseTable);
+#ifdef PADDLE_WITH_HETERPS
+REGISTER_PSCORE_CLASS(Table, SSDSparseTable);
+#endif
 REGISTER_PSCORE_CLASS(Table, SparseGeoTable);
 REGISTER_PSCORE_CLASS(Table, BarrierTable);
 REGISTER_PSCORE_CLASS(Table, TensorTable);
diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h
index 3890631a6f8a9e99948e32cdd3cb8c1e00c2de75..a0816b65a3d15c9cf1384d1b6f18fa79f9199a83 100644
--- a/paddle/fluid/extension/include/ext_dtype.h
+++ b/paddle/fluid/extension/include/ext_dtype.h
@@ -16,15 +16,14 @@ limitations under the License.
*/ #include #include -#include "complex128.h" // NOLINT -#include "complex64.h" // NOLINT +#include "complex.h" // NOLINT #include "ext_exception.h" // NOLINT #include "float16.h" // NOLINT namespace paddle { -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; +using complex64 = paddle::platform::complex; +using complex128 = paddle::platform::complex; using float16 = paddle::platform::float16; enum class DataType { diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 8b2f7cc5bf13c99b80cd365f5c449f3d3b68bdc5..ab98bdc0bfb47e07e5742ac1ee9cebe60f5c7a69 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" @@ -238,9 +237,9 @@ template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor Tensor::copy_to>( const PlaceType &target_place) const; -template PD_DLL_DECL Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor Tensor::copy_to>( const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; @@ -253,10 +252,10 @@ template PD_DLL_DECL uint8_t *Tensor::data() const; template PD_DLL_DECL int8_t *Tensor::data() const; template PD_DLL_DECL int16_t *Tensor::data() const; template PD_DLL_DECL bool *Tensor::data() const; -template PD_DLL_DECL paddle::platform::complex64 * -Tensor::data() const; -template PD_DLL_DECL paddle::platform::complex128 * -Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex + *Tensor::data>() const; +template PD_DLL_DECL paddle::platform::complex + *Tensor::data>() const; template PD_DLL_DECL paddle::platform::float16 * Tensor::data() const; @@ -268,10 +267,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data(); template PD_DLL_DECL int8_t *Tensor::mutable_data(); template PD_DLL_DECL int16_t *Tensor::mutable_data(); template PD_DLL_DECL bool *Tensor::mutable_data(); -template PD_DLL_DECL paddle::platform::complex64 * -Tensor::mutable_data(); -template PD_DLL_DECL paddle::platform::complex128 * -Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex + *Tensor::mutable_data>(); +template PD_DLL_DECL paddle::platform::complex + *Tensor::mutable_data>(); template PD_DLL_DECL paddle::platform::float16 * Tensor::mutable_data(); @@ -289,10 +288,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data( template PD_DLL_DECL int16_t *Tensor::mutable_data( const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); -template PD_DLL_DECL paddle::platform::complex64 * -Tensor::mutable_data(const PlaceType &place); -template PD_DLL_DECL paddle::platform::complex128 * -Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex * +Tensor::mutable_data>(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex * +Tensor::mutable_data>(const 
PlaceType &place); template PD_DLL_DECL paddle::platform::float16 * Tensor::mutable_data(const PlaceType &place); @@ -356,13 +355,13 @@ Tensor Tensor::cast(const DataType &target_type) const { dst_type, CastDataType(*tensor, rlt_tensor_, ctx)); break; case framework::proto::VarType::COMPLEX64: - framework::VisitDataType( - dst_type, - CastDataType(*tensor, rlt_tensor_, ctx)); + framework::VisitDataType(dst_type, + CastDataType>( + *tensor, rlt_tensor_, ctx)); break; case framework::proto::VarType::COMPLEX128: framework::VisitDataType(dst_type, - CastDataType( + CastDataType>( *tensor, rlt_tensor_, ctx)); break; case framework::proto::VarType::FP16: diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index e5dc75e27d4be943bd1339790c51721d0af5651a..bd36d660be427f344dba509f43c124bcbe7e9777 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,7 +27,22 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) + proto_library(op_def_proto SRCS op_def.proto) +cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto) + +FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "namespace { \n" + "const std::unordered_map op_def_map = { \n") +foreach(OP_DEF_FILE ${OP_DEF_FILES}) + FILE(READ ${OP_DEF_FILE} OP_DEF_CONTENT) + get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE) + FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n") +endforeach(OP_DEF_FILE) +FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") + proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index a65dcbd55f94630612ce59b4d07b0789aaf7c697..733831263a184f5060cca58c26866ac3350c155c 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -109,9 +109,9 @@ void GroupTestCopy() { TestCopyTensor(); VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); - VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; + VLOG(2) << "complex cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); - VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; + VLOG(2) << "complex cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); @@ -132,9 +132,9 @@ void GroupTestCast() { TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float cast"; TestCast(paddle::DataType::FLOAT32); - VLOG(2) << "complex64 cast"; + VLOG(2) << "complex cast"; TestCast(paddle::DataType::FLOAT32); - VLOG(2) << "complex128 cast"; + VLOG(2) << "complex cast"; TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float16 cast"; TestCast(paddle::DataType::FLOAT16); diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 7d005c9690b9486ff8c693d9c14f83853a016ced..f447a00f37c808bafe99b54af4984af9c2af1cfe 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, platform::errors::Unavailable("Currently, model parallelism is only " "supported between CPU and CUDA.")); + // NOTE(zhiqiu): Special case for 
CPU->NPU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) { + TensorCopy(in, dst_place, + *platform::DeviceContextPool::Instance().Get(dst_place), out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 6f244ee1713597916961ef8dae4d135d9dc88a56..7b91d545b547b8aabc2f907090f79009225605ec 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -638,25 +638,34 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe( const char* str = reader.get(); std::string line = std::string(str); - // VLOG(3) << line; + char* endptr = const_cast(str); int pos = 0; for (size_t i = 0; i < use_slots_index_.size(); ++i) { int idx = use_slots_index_[i]; int num = strtol(&str[pos], &endptr, 10); - PADDLE_ENFORCE_NE( - num, 0, - platform::errors::InvalidArgument( - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s, \n Specifically, " - "something wrong happened(the length of this slot's feasign is 0)" - "when we parse the %d th slots." - "Maybe something wrong around this slot" - "\nWe detect the feasign number of this slot is %d, " - "which is illegal.", - str, i, num)); + + if (num <= 0) { + std::stringstream ss; + ss << "\n\nGot unexpected input, maybe something wrong with it.\n"; + ss << "\n----------------------\n"; + ss << "The Origin Input Data:\n"; + ss << "----------------------\n"; + + ss << line << "\n"; + + ss << "\n----------------------\n"; + ss << "Some Possible Errors:\n"; + ss << "----------------------\n"; + ss << "1. The number of ids can not be zero, you need padding.\n"; + ss << "2. The input data contains unresolvable characters.\n"; + ss << "3. We detect the slot " << i << "'s feasign number is " << num + << " which is illegal.\n"; + ss << "\n"; + + PADDLE_THROW(platform::errors::InvalidArgument(ss.str())); + } + if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); if ((*instance)[idx].GetType()[0] == 'f') { // float diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 648a32420aa6c0d12545c72ff8cec778d817d7e9..a16f35dc11b8f1525685fe3499cfdce6f9b86968 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -19,8 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -28,8 +26,8 @@ limitations under the License. 
*/ namespace paddle { namespace platform { struct bfloat16; -struct complex128; -struct complex64; +template +struct complex; struct float16; template struct complex; @@ -53,35 +51,31 @@ struct DataTypeTrait { #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ - _ForEachDataTypeHelper_(callback, int8_t, INT8); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ - COMPLEX64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ - COMPLEX128); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); - -#define _ForEachDataTypeSmall_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ - COMPLEX64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ - COMPLEX128); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); + +#define _ForEachDataTypeSmall_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); // For the use of thrust, as index-type elements can be only integers. 
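// (Each _ForEachDataTypeHelper_ entry expands to
//  `callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);`,
//  so e.g. the FP32 entry above becomes
//  `callback(float, ::paddle::framework::proto::VarType::FP32);`.)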
#define _ForEachDataTypeTiny_(callback) \ diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 5a716eba8dbe86e37c1ca1758751f04bdd6c651d..888687c06ce9073108ea5439037da966c45cceda 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -119,12 +119,12 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, // complex -> real switch (src_type) { case proto::VarType::COMPLEX64: - framework::VisitDataType(dst_type, - CastDataType(in, out, ctx)); + framework::VisitDataType( + dst_type, CastDataType>(in, out, ctx)); break; case proto::VarType::COMPLEX128: framework::VisitDataType( - dst_type, CastDataType(in, out, ctx)); + dst_type, CastDataType>(in, out, ctx)); break; default: PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 829772448eb91e428224647168029395d95ab9f6..f9aa14bf7e8d7e9a632cafefa8b88f0ae35c5a6c 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -159,10 +159,6 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) #pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex64 : omp_out += \ - omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \ - omp_in) #pragma omp declare reduction(+ : paddle::platform::complex < \ float > : omp_out += omp_in) #pragma omp declare reduction(+ : paddle::platform::complex < \ @@ -222,58 +218,6 @@ void CheckNanInf( } } -template <> -void CheckNanInf( - const paddle::platform::complex64* value, const size_t numel, int print_num, - const std::string& op_type, const std::string& var_name) { - float real_sum = 0.0f; -#pragma omp parallel for reduction(+ : real_sum) - for (size_t i = 0; i < numel; ++i) { - real_sum += (value[i].real - value[i].real); - } - - float imag_sum = 0.0f; -#pragma omp parallel for reduction(+ : imag_sum) - for (size_t i = 0; i < numel; ++i) { - imag_sum += (value[i].imag - value[i].imag); - } - - if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || - std::isinf(imag_sum)) { - // hot fix for compile failed in gcc4.8 - // here also need print detail info of nan or inf later - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name, - op_type)); - } -} - -template <> -void CheckNanInf( - const paddle::platform::complex128* value, const size_t numel, - int print_num, const std::string& op_type, const std::string& var_name) { - double real_sum = 0.0; -#pragma omp parallel for reduction(+ : real_sum) - for (size_t i = 0; i < numel; ++i) { - real_sum += (value[i].real - value[i].real); - } - - double imag_sum = 0.0; -#pragma omp parallel for reduction(+ : imag_sum) - for (size_t i = 0; i < numel; ++i) { - imag_sum += (value[i].imag - value[i].imag); - } - - if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || - std::isinf(imag_sum)) { - // hot fix for compile failed in gcc4.8 - // here also need print detail info of nan or inf later - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name, - op_type)); - } -} - 
template <> void CheckNanInf>( const paddle::platform::complex* value, const size_t numel, diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 54d8fc92b29459ec062b6809ef4cd5156d50c21a..f1f5ba7789ea6137800e7fcfe2d404ca2d87845b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -29,9 +29,7 @@ template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same>::value || - std::is_same>::value || - std::is_same::value || - std::is_same::value) { + std::is_same>::value) { // The current dlpack library version is v0.2, and does not define // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set // dtype.code to 5U directly here. After the dlpack library version being diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 1a79ada0be7c620eab3e64e8ba600f557af6d39e..8265d105accae0b8a009b1798a6c36053b51ab25 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -29,9 +29,7 @@ namespace { // NOLINT template constexpr uint8_t GetDLDataTypeCode() { if (std::is_same>::value || - std::is_same>::value || - std::is_same::value || - std::is_same::value) { + std::is_same>::value) { return static_cast(5); } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 7e7f1fed5ad58db25909c25ca60f5eac80a5f478..16dfc90d27e6a6087a751e0172cbe84e7d377dca 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -50,7 +50,7 @@ if (WITH_TESTING) endif(WITH_TESTING) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) -cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) +cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) @@ -171,7 +171,7 @@ if (WITH_MKLDNN) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) - set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context) + set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function) if (WITH_GPU OR WITH_ROCM) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index bc1be79d1b1688690965bf772c011d774ae1da78..656d453d4030439f0229492a7c2ab2ee46481950 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ 
b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_fuse_pass.h" - #include +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -23,6 +23,65 @@ namespace paddle { namespace framework { namespace ir { +FCFusePass::FCFusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"relu", ""}) + .End(); +} + void FCFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -52,6 +111,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { LOG(WARNING) << "The subgraph is empty."; return; } + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle FC fuse"; GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); @@ -159,6 +222,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { } desc.Flush(); + if (!IsCompat(desc)) { + LOG(WARNING) << "Fc fuse pass in out fc op compat failed."; + return; + } + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. 
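   // GraphSafeRemoveNodes below drops the matched mul/elementwise_add (and
   // relu) nodes now that the fused fc op replaces them in the graph.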
if (with_relu) { GraphSafeRemoveNodes( diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index f564bbb151854fe325975285b18d25b517336014..21ef17b65dc2cb8b630155693024b706864f64d5 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -30,6 +30,7 @@ class Graph; class FCFusePass : public FusePassBase { public: + FCFusePass(); virtual ~FCFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index a8c0973cac488ceb96249a898e819af7565c6c7a..5434678ccb04ac9a2a3b3e722d3f0c0f9b1ff5c3 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -15,4 +15,4 @@ cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_ cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) -cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) +cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op eigen_function) diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc index b056c3b07a2f65bf0756285857edd3355b591c29..3d8e655c5b2730fd36651c67d2f7c37b7dd5ecd9 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/op_def_api.h" #include "paddle/fluid/framework/op_info.h" + namespace paddle { namespace framework { namespace ir { @@ -50,18 +53,17 @@ AttrCompat& AttrCompat::IsIntIn(const std::set& candidates) { return *this; } -//! Todo: append the definition. AttrCompat& AttrCompat::IsLeftDefault() { const std::string& op_name = op_compat_->Name(); if (!OpInfoMap::Instance().Has(op_name)) { - VLOG(3) << "Op (" << op_name << ") is not registered!"; + LOG(WARNING) << "Op (" << op_name << ") is not registered!"; conditions_.emplace_back([](const Attribute& attr) { return false; }); return *this; } const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); if (attrs.find(attr_name_) == attrs.end()) { - VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; + LOG(WARNING) << "Op (" << op_name << ") has no default attr:" << attr_name_; conditions_.emplace_back([](const Attribute& attr) { return false; }); } else { Attribute default_attr = attrs.at(attr_name_); @@ -77,6 +79,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) { return true; } if (!op_desc.HasAttr(attr_name_)) { + if (!optional_) { + LOG(WARNING) << "The non-optional Attr(" << attr_name_ << ") of Op (" + << op_compat_->Name() << ") not find ! 
"; + } return optional_; } const Attribute attr = op_desc.GetAttr(attr_name_); @@ -149,19 +155,35 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { } bool OpCompat::Judge(const OpDesc& op_desc) { + if (is_first_judge_) { + is_first_judge_ = false; + const proto::OpDef& op_def = GetOpDef(op_name_); + if (op_def.has_extra()) { + for (const proto::OpDef_AttrDef& attr : op_def.extra().attrs()) { + extra_attrs_.emplace(attr.name()); + } + } + } + for (auto& attr_map : op_desc.GetAttrMap()) { if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { + if (extra_attrs_.find(attr_map.first) != extra_attrs_.end()) { + continue; + } if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { - VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ - << ") not reigistered in OpCompat, not equal to default value!"; + LOG(WARNING) + << "The Attr(" << attr_map.first << ") of Op (" << op_name_ + << ") not reigistered in OpCompat, not in extra attribute, not " + "equal to default value!"; return false; } } } + for (auto& attr_compat : attr_compats_) { if (!attr_compat.second(op_desc)) { - VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" - << op_name_ << ") failed!"; + LOG(WARNING) << " Check the Attr(" << attr_compat.first << ") of Op(" + << op_name_ << ") failed!"; return false; } } @@ -170,8 +192,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_desc : inputs_map) { if (input_compats_.find(input_desc.first) == input_compats_.end()) { if (!input_desc.second.empty()) { - VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" - << op_name_ << ") not reigistered in OpCompat!"; + LOG(WARNING) << "The Input (" << input_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; return false; } } @@ -179,14 +201,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& input_val : input_compats_) { if (inputs_map.find(input_val.first) == inputs_map.end()) { if (!input_val.second.Optional()) { - VLOG(3) << "The No optional Input (" << input_val.first - << ") of Operator (" << op_name_ << ") not find in op_desc!"; + LOG(WARNING) << "The No optional Input (" << input_val.first + << ") of Operator (" << op_name_ + << ") not find in op_desc!"; return false; } } else { if (!input_val.second(inputs_map.at(input_val.first))) { - VLOG(3) << "The Input (" << input_val.first << ") of Operator (" - << op_name_ << ") compat check failed!"; + LOG(WARNING) << "The Input (" << input_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } @@ -196,8 +219,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& output_desc : outputs_map) { if (output_compats_.find(output_desc.first) == output_compats_.end()) { if (!output_desc.second.empty()) { - VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" - << op_name_ << ") not reigistered in OpCompat!"; + LOG(WARNING) << "The Output (" << output_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; return false; } } @@ -205,14 +228,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) { for (auto& output_val : output_compats_) { if (outputs_map.find(output_val.first) == outputs_map.end()) { if (!output_val.second.Optional()) { - VLOG(3) << "The No optional Output (" << output_val.first - << ") of Operator (" << op_name_ << ") not find in op_desc!"; + LOG(WARNING) << "The No optional Output (" << output_val.first + << ") of Operator (" << op_name_ + << ") not find in op_desc!"; return 
false; } } else { if (!output_val.second(outputs_map.at(output_val.first))) { - VLOG(3) << "The Output (" << output_val.first << ") of Operator (" - << op_name_ << ") compat check failed!"; + LOG(WARNING) << "The Output (" << output_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; return false; } } diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h index 3f2ea673d879b8f1ca3ddbed82b6120af5044d47..3aa985c6d46fa262bd4050f63e668c68e55237ac 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass.h +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -140,6 +140,8 @@ class OpCompat { std::unordered_map attr_compats_; std::unordered_map input_compats_; std::unordered_map output_compats_; + std::unordered_set extra_attrs_; + bool is_first_judge_ = true; }; /** @@ -203,6 +205,7 @@ class OpCompatSensiblePass : public Pass { if (!node_pair.second->IsOp()) continue; auto op_type = node_pair.second->Op()->Type(); if (!op_compat_judgers_.count(op_type)) { + LOG(WARNING) << op_type << "compat not registered!"; return false; } auto& judger = *op_compat_judgers_.at(op_type); diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc index 0878e4d9890d35bc4ecdf276880b43e9c5f4f416..87e28ae3a3aadda63ef67c82596d20cfb0c644f4 100644 --- a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -27,7 +27,6 @@ TEST(OpCompatSensiblePass, compatOp) { compat.AddAttr("in_num_col_dims") .IsIntIn({1, 2}) .IsNumLE(1) - .IsLeftDefault() .End() .AddAttr("activation_type") .IsStringIn({"tanh", "sigmoid"}) @@ -68,7 +67,7 @@ TEST(OpCompatSensiblePass, compatOp) { fc_op.SetOutput("Out", std::vector{"test_output"}); EXPECT_STREQ(compat.Name().c_str(), "fc"); - EXPECT_FALSE(compat.Judge(fc_op)); + EXPECT_TRUE(compat.Judge(fc_op)); } TEST(OpCompatSensiblePass, compatOpAttribute) { @@ -92,6 +91,18 @@ TEST(OpCompatSensiblePass, compatOpAttribute) { delete info.checker_; } +TEST(OpCompatSensiblePass, opDefNotFound) { + OpCompat compat("fc_1"); + + OpDesc fc_op; + + compat.Judge(fc_op); + + OpCompat compat_1(""); + + compat_1.Judge(fc_op); +} + TEST(OpCompatSensiblePass, compatOpAttributeOptional) { OpCompat compat("fc"); compat.AddAttr("activation_type") diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e758fe4105097e0c6f3032d1d4e150b661ff5f5 --- /dev/null +++ b/paddle/fluid/framework/op_def_api.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
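+
+// GetOpDef() below returns the proto::OpDef for an operator, parsed lazily
+// (under a mutex) from the pbtxt strings that cmake embeds into the generated
+// op_def.pbtxt header at build time.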
+ +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif +#include "paddle/fluid/framework/op_def_api.h" +#include +#include +#include +#include +#ifdef _LINUX +#include +#include +#include +#endif +#include +#include +#include "glog/logging.h" +#include "io/fs.h" +#include "paddle/fluid/framework/op_def.pb.h" + +/* +// op_def.pbtxt +namespace { + const std::unordered_map op_def_map = {...}; +} +*/ +#include "paddle/fluid/framework/op_def.pbtxt" //NOLINT + +namespace paddle { +namespace framework { + +const proto::OpDef& GetOpDef(const std::string& op_name) { + static std::unordered_map ops_definition; + static std::mutex mtx; + if (ops_definition.find(op_name) == ops_definition.end()) { + std::lock_guard lk(mtx); + if (ops_definition.find(op_name) == ops_definition.end()) { + proto::OpDef op_def; + if (op_def_map.find(op_name) == op_def_map.end()) { + LOG(WARNING) << op_name << ".pbtxt not exist!"; + } else { + if (!::google::protobuf::TextFormat::ParseFromString( + op_def_map.at(op_name), &op_def)) { + LOG(WARNING) << "Failed to parse " << op_name; + } + } + if (op_def.type() != op_name) { + LOG(WARNING) << op_name << ".pbtxt has error type :" << op_def.type(); + ops_definition.emplace(std::make_pair(op_name, proto::OpDef())); + } else { + ops_definition.emplace(std::make_pair(op_name, std::move(op_def))); + } + } + } + return ops_definition.at(op_name); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/framework/op_def_api.h similarity index 54% rename from paddle/fluid/operators/increment_op.cu rename to paddle/fluid/framework/op_def_api.h index 228063bf3d4b24bbd03649189f6ddba9a5f0ca30..4ec2089f9b1f88de18305cb5a6615f96f2718d39 100644 --- a/paddle/fluid/operators/increment_op.cu +++ b/paddle/fluid/framework/op_def_api.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,11 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" +#pragma once -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); +#include "paddle/fluid/framework/op_def.pb.h" + +namespace paddle { +namespace framework { +const proto::OpDef& GetOpDef(const std::string& op_name); +} +} diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 593d4d839fa910d2ef81b3ae7483cee4399926cb..348ca5b952bfeab364a5b01ec99e4d0381ab4e84 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -317,8 +317,12 @@ struct OpKernelRegistrarFunctorEx("inference_force_prepare_data"); + if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) { need_prepare_data_ = false; } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 105751645bbc5929dc07e524dcc3e8b52ec52034..32460a98ce511cba152d791f0df9e84fe12f24e7 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -22,8 +22,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -1137,9 +1136,9 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { } template <> -std::ostream& print_tensor( +std::ostream& print_tensor>( std::ostream& os, const framework::Tensor& tensor) { - auto inspect = tensor.data(); + auto inspect = tensor.data>(); auto element_num = tensor.numel(); os << " - data: ["; @@ -1155,9 +1154,9 @@ std::ostream& print_tensor( } template <> -std::ostream& print_tensor( +std::ostream& print_tensor>( std::ostream& os, const framework::Tensor& tensor) { - auto inspect = tensor.data(); + auto inspect = tensor.data>(); auto element_num = tensor.numel(); os << " - data: ["; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 6b9b411713329ad0f9f663f57a081a7404a5aa7b..57657941ef83f3a3ea0e9e716d49a8b38d22eef8 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -24,8 +24,7 @@ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" @@ -200,8 +199,8 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future - PADDLE_TENSOR_ADD(platform::complex64); - PADDLE_TENSOR_ADD(platform::complex128); + PADDLE_TENSOR_ADD(platform::complex); + PADDLE_TENSOR_ADD(platform::complex); #endif #undef PADDLE_TENSOR_ADD diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1ec692d3d1df66d8c1df689d557b289fc2880b30..e628216a5ed87b42a3f37a1adb86f441a735151e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -270,7 +270,46 @@ bool AnalysisPredictor::CreateExecutor() { executor_.reset(new paddle::framework::NaiveExecutor(place_)); return true; } + +static bool IsPrepareDataOptTargetOp(framework::OpDesc *op) { + // here is prepare data optimization related bad cases: + // let's assume an op behind conditional_block and if conditional_block + // chooses branch 1, the op need to call prepare data. else the op don't need + // to call prepare data. In running, if predictor chooses branch 2, then + // optimization takes effect, later issue is followed if predictor chooses + // branch 1, because the op lost chance to prepare data. 
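+  // i.e. an op behind one of these ops may or may not execute depending on
+  // the branch taken at runtime, so it must not skip PrepareData based on an
+  // earlier run; the "inference_force_prepare_data" attribute set below is
+  // honored when the operator decides whether to re-prepare data (see the
+  // inference_force_prepare_data check earlier in this patch).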
+  std::vector<std::string> op_type = {"conditional_block_infer",
+                                      "select_input"};
+  for (const auto &type : op_type) {
+    if (op->Type() == type) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static void DisablePrepareDataOpt(
+    std::shared_ptr<framework::ProgramDesc> inference_program, int block,
+    bool pre_disable_opt) {
+  bool disable_opt = false;
+  auto &infer_block = inference_program->Block(block);
+  for (auto *op : infer_block.AllOps()) {
+    if (disable_opt || pre_disable_opt) {
+      op->SetAttr("inference_force_prepare_data", true);
+    }
+    if (op->HasAttr("sub_block")) {
+      int blockID = op->GetBlockAttrId("sub_block");
+      DisablePrepareDataOpt(inference_program, blockID,
+                            disable_opt || pre_disable_opt);
+    }
+    // disable prepare data if unfriendly op is found
+    disable_opt = IsPrepareDataOptTargetOp(op);
+  }
+}
+
 bool AnalysisPredictor::PrepareExecutor() {
+  DisablePrepareDataOpt(inference_program_, 0, false);
+
   executor_->Prepare(sub_scope_, *inference_program_, 0,
                      config_.use_feed_fetch_ops_);
@@ -1197,6 +1236,9 @@ USE_TRT_CONVERTER(roi_align);
 USE_TRT_CONVERTER(affine_channel);
 USE_TRT_CONVERTER(multiclass_nms);
 USE_TRT_CONVERTER(nearest_interp);
+USE_TRT_CONVERTER(reshape);
+USE_TRT_CONVERTER(reduce_sum);
+USE_TRT_CONVERTER(gather_nd);
 #endif

 namespace paddle_infer {
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 3820ac5d7cc24693c388554acea0aad6ab49b83a..2e4a175566a7a100749d14c712e8ef9a89eb6019 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -12,6 +12,9 @@ nv_library(tensorrt_converter
       affine_channel_op.cc
       multiclass_nms_op.cc
       nearest_interp_op.cc
+      reshape_op.cc
+      reduce_op.cc
+      gather_nd_op.cc
       DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 61199724bcfe30dfcfc0e044a54e49b62d3a0936..6bbda6bb29aadbfcf4974e2db5eac65a027a19a5 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
                               static_cast<void*>(bias_data), bias_size};
-  auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input,
-                           nv_ksize, weight, bias);
-  PADDLE_ENFORCE_NOT_NULL(layer,
-                          platform::errors::Fatal("TensorRT create conv2d"
-                                                  " layer error."));
+  // In conv2d_transpose and depthwise_conv2d_transpose,
+  // output channels = filter_dims[1] * groups
+  auto* layer = (op_desc.Type() == "conv2d_transpose" ||
+                 op_desc.Type() == "depthwise_conv2d_transpose")
+                    ? fadd_layer(const_cast<nvinfer1::ITensor*>(X),
+                                 n_input * groups, nv_ksize, weight, bias)
+                    : fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output,
+                                 nv_ksize, weight, bias);
+
+  PADDLE_ENFORCE_NOT_NULL(
+      layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose"
+                                     " layer failed."));
   layer->setStride(nv_strides);
   layer->setPadding(nv_paddings);
   layer->setNbGroups(groups);
@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter {
     ConvertConv2d(
         engine_, op, scope, test_mode,
         [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
-            int n_input,                             /* Conv input maps */
             nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
             TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
           auto* layer =
@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter {
     ConvertConv2d(
         engine_, op, scope, test_mode,
         [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
-            int n_input,                             /* Deconv output maps */
             nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
             TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
           auto* layer =
diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
index 66a682db07b91195046d3d11031b8739b72b81c4..04c51202f022f65cf0c6b6a9671248acbc547a4a 100644
--- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
+++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc
@@ -40,10 +40,19 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
     auto word_emb_name = op_desc.Input("WordEmbedding").front();
     auto pos_emb_name = op_desc.Input("PosEmbedding").front();
     auto sent_emb_name = op_desc.Input("SentEmbedding").front();
-    std::vector<std::string> id_names = {word_id_name, pos_id_name,
-                                         sent_id_name};
-    std::vector<std::string> emb_names = {word_emb_name, pos_emb_name,
-                                          sent_emb_name};
+
+    std::vector<std::string> id_names;
+    std::vector<std::string> emb_names;
+
+    if (engine_->use_oss()) {
+      id_names =
+          std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
+      emb_names =
+          std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};
+    } else {
+      id_names = op_desc.Input("Ids");
+      emb_names = op_desc.Input("Embs");
+    }

     int input_num = id_names.size();
diff --git a/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..489fc987dfec2a13b4baccb06911c940b627d908
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class GatherNdOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a paddle gather_nd op to tensorrt gather_nd plugin"; + framework::OpDesc op_desc(op, nullptr); + + // Declare inputs + std::vector inputs; + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto* index = engine_->GetITensor(op_desc.Input("Index")[0]); + inputs.emplace_back(input); + inputs.emplace_back(index); + + nvinfer1::ILayer* layer = nullptr; + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GatherNdPluginDynamic* plugin = + new plugin::GatherNdPluginDynamic(with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin); + + std::string layer_name = "gather_nd (Output: "; + auto output_name = op_desc.Output("Out")[0]; + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + layer->setName((layer_name + ")").c_str()); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f72ae2c3ec2d7e013247f294a6f3e6dd4572ae35..57a26aec6ebcb3d1350ec560927b76bf1988d64b 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -127,6 +127,13 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + // reshape2 == reshape + if (op_desc.Type() == "reshape2") { + it = Registry::Global().Lookup("reshape"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (!it) { it = Registry::Global().Lookup(op_desc.Type()); } diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..66d2680fe9969cf7857130f1aa6e6aef742ca805 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ReduceSumOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer"; + framework::OpDesc op_desc(op, nullptr); + + auto* x = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims input_shape = x->getDimensions(); + int input_dims = input_shape.nbDims; + + bool keep_dim = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_dim")); + std::vector dim = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("dim")); + bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all")); + + // Now we only support dynamic_shape mode. + nvinfer1::IReduceLayer* layer = nullptr; + if (reduce_all) { + uint32_t reduce_dim = 0; + for (int i = 0; i < input_dims; ++i) { + reduce_dim |= 1 << i; + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, + nvinfer1::ReduceOperation::kSUM, reduce_dim, + keep_dim); + } else { + auto CvtToBitMask = [&](const std::vector& dims) -> uint32_t { + uint32_t res = 0; + for (auto x : dims) { + if (x < 0) { + res |= 1 << (x + input_dims); + } else { + res |= 1 << x; + } + } + return res; + }; + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, + nvinfer1::ReduceOperation::kSUM, + CvtToBitMask(dim), keep_dim); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..489603e20cda2f1143fd4791c8cbe5e8e58e4148 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * ReshapeOp + */ +class ReshapeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + std::vector shape = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); + int nbDims_num = shape.size(); + nvinfer1::Dims reshape_dim; + if (engine_->with_dynamic_shape()) { // running the TRT Dynamic Shape mode + reshape_dim.nbDims = nbDims_num; + for (int i = 0; i < nbDims_num; ++i) { + reshape_dim.d[i] = shape[i]; + } + } else { // running the TRT Static Shape mode + reshape_dim.nbDims = nbDims_num - 1; + for (int i = 0; i < nbDims_num - 1; ++i) { + reshape_dim.d[i] = shape[i + 1]; + } + } + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(reshape_dim); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "reshape", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(reshape, ReshapeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 9df3ec0445ad1c6c778f81a5e1489096c850c589..6c6a59e98d9e21728a9fbca08de1ffb455b55ccd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/op_teller.h" + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -49,6 +50,10 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); +#endif +#if CUDA_VERSION >= 10200 + teller_set.insert("reshape"); + teller_set.insert("reshape2"); #endif } @@ -118,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "gather_nd", "yolo_box", "roi_align", "affine_channel", "nearest_interp", "anchor_generator", + "reduce_sum", }; }; @@ -143,19 +150,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; -// strides > 1 is only supported by trt7.0 above -#if !IS_TRT_VERSION_GE(7000) - if (desc.HasAttr("strides")) { - const std::vector strides = - BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); - // there is no issue if strides.size() less than 2 - if (strides.size() > 1) { - for (size_t i = 0; i < strides.size(); i++) { - if (strides[i] > 1) return false; - } - } - } -#endif } if (op_type == "pool2d") { @@ -239,15 +233,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } -// strides > 1 is only supported by trt7.0 above +// strides > 1 and 'SAME' is only supported by trt7.0 above #if !IS_TRT_VERSION_GE(7000) - if (desc.HasAttr("strides")) { - const std::vector strides = - BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); - // there is no issue if strides.size() less than 2 - if (strides.size() > 1) { - for (size_t i = 0; i < strides.size(); i++) { - if (strides[i] > 1) return false; + if (op_type == "conv2d" || op_type == "conv2d_fusion" || + op_type == "depthwise_conv2d") { + if (desc.HasAttr("padding_algorithm") && with_dynamic_shape) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "SAME" && desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } } } } @@ -326,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "gather_nd") { + auto* block = desc.Block(); + auto x_var_name = desc.Input("X")[0]; + auto index_var_name = desc.Input("Index")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + auto* index_var_desc = block->FindVar(index_var_name); + + // The index input must be int32 datatype. 
+ if (index_var_desc->GetDataType() != + paddle::framework::proto::VarType_Type::VarType_Type_INT32) { + VLOG(3) << "gather_nd op Index input data type must be int32"; + return false; + } + + const auto index_shape = index_var_desc->GetShape(); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() != index_shape.size()) { + VLOG(3) << "gather_nd op Index input dims size [" << index_shape.size() + << " ] not equal to x dims size [" << x_shape.size() << "]"; + return false; + } + if (!with_dynamic_shape) return false; + } + if (op_type == "yolo_box") { if (with_dynamic_shape) return false; bool has_attrs = @@ -673,6 +698,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + if (op_type == "reshape" || op_type == "reshape2") { + if (!desc.HasAttr("shape")) { + return false; + // Paddle-TRT does not support the input tensors: Shape and ShapeTensor + } else if (desc.Input("Shape").size() >= 1 || + desc.Input("ShapeTensor").size() >= 1) { + return false; + } else { + std::vector shape = + BOOST_GET_CONST(std::vector, desc.GetAttr("shape")); + if (shape.size() >= nvinfer1::Dims::MAX_DIMS) return false; + } + } + + if (op_type == "reduce_sum") { + if (!with_dynamic_shape) { + VLOG(3) << "the reduce_sum does not support static shape yet"; + return false; + } + + if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") && + desc.HasAttr("reduce_all"))) { + VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or " + "reduce_all)"; + return false; + } + } if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 1804e6c5571d3a15b0b9adc67dc535b46635caa8..26125d21ad7d1a8e9348ce547494289b3a7dd6ad 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -8,6 +8,7 @@ nv_library(tensorrt_plugin anchor_generator_op_plugin.cu yolo_box_op_plugin.cu roi_align_op_plugin.cu + gather_nd_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f4ac054c95b34f305d0213d773f5bcfe5e9e3e9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu @@ -0,0 +1,229 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
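+
+// GatherNdPluginDynamic: dynamic-shape TensorRT plugin for gather_nd.
+// The CUDA kernel below gathers slices of X addressed by int32 Index tuples;
+// fp32 and fp16 inputs are supported (fp16 only when with_fp16_ is set).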
+ +#include + +#include +#include +#include +#include +#include + +#include "NvInferRuntimeCommon.h" +#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) + +template +__global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims, + const IndexT* indices, T* output, + int32_t remain_size, int32_t slice_size, + int32_t end_size) { + CUDA_KERNEL_LOOP(i, remain_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i = 0; + int32_t temp = slice_size; + for (int32_t j = end_size - 1; j >= 0; --j) { + auto index_value = indices[indices_i * end_size + j]; + PADDLE_ENFORCE( + index_value >= 0 && index_value < input_dims[j], + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. It should " + "be less than [%d] and greater or equal to 0, but received [%d]", + input_dims[j], index_value); + gather_i += (index_value * temp); + temp *= input_dims[j]; + } + IndexT input_i = gather_i + slice_i; + *(output + i) = *(input + input_i); + } +} + +int GatherNdPluginDynamic::initialize() { return 0; } + +size_t GatherNdPluginDynamic::getSerializationSize() const { + return SerializedSize(with_fp16_); +} + +void GatherNdPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, with_fp16_); +} + +nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) { + PADDLE_ENFORCE_EQ( + nb_inputs, 2, + platform::errors::InvalidArgument( + "The gather_nd plugin should have 2 input, but got %d.", nb_inputs)); + PADDLE_ENFORCE_EQ(output_index, 0, + platform::errors::InvalidArgument( + "When GetOutputDimensions in gather_nd " + "plugin, the output_index should be 0.")); + + nvinfer1::DimsExprs x_dims = inputs[0]; + nvinfer1::DimsExprs index_dims = inputs[1]; + + int32_t x_dims_size = x_dims.nbDims; + int32_t index_dims_size = index_dims.nbDims; + + // TODO(wilber): The result dims shoule be Index.shape[:-1] + + // X.shape[Index.shape[-1]:], but the trt DimsExprs is an expression we can't + // get the actual value. So we only support one scenario: input_dims.size == + // index_dims.size. 
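+  // e.g. X.shape = [6, 8, 16] with Index.shape = [6, 4, 2]: ret starts from
+  // x_dims as [6, 8, 16] and its first index_dims_size - 1 entries are
+  // overwritten with index dims, giving [6, 4, 16]
+  // (= Index.shape[:-1] + X.shape[2:]).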
+
+int GatherNdPluginDynamic::initialize() { return 0; }
+
+size_t GatherNdPluginDynamic::getSerializationSize() const {
+  return SerializedSize(with_fp16_);
+}
+
+void GatherNdPluginDynamic::serialize(void* buffer) const {
+  SerializeValue(&buffer, with_fp16_);
+}
+
+nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions(
+    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder) {
+  PADDLE_ENFORCE_EQ(
+      nb_inputs, 2,
+      platform::errors::InvalidArgument(
+          "The gather_nd plugin should have 2 inputs, but got %d.", nb_inputs));
+  PADDLE_ENFORCE_EQ(output_index, 0,
+                    platform::errors::InvalidArgument(
+                        "When GetOutputDimensions in gather_nd "
+                        "plugin, the output_index should be 0."));
+
+  nvinfer1::DimsExprs x_dims = inputs[0];
+  nvinfer1::DimsExprs index_dims = inputs[1];
+
+  int32_t x_dims_size = x_dims.nbDims;
+  int32_t index_dims_size = index_dims.nbDims;
+
+  // TODO(wilber): The result dims should be Index.shape[:-1] +
+  // X.shape[Index.shape[-1]:], but trt DimsExprs is an expression whose
+  // actual value we cannot get here. So we only support one scenario:
+  // input_dims.size == index_dims.size.
+  nvinfer1::DimsExprs ret(x_dims);
+  for (int i = 0; i < index_dims_size - 1; ++i) {
+    ret.d[i] = index_dims.d[i];
+  }
+
+  return ret;
+}
+
+bool GatherNdPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
+    int nb_outputs) {
+  PADDLE_ENFORCE_NOT_NULL(
+      in_out, platform::errors::InvalidArgument(
+                  "The input of gather_nd plugin should not be nullptr."));
+
+  PADDLE_ENFORCE_LT(
+      pos, nb_inputs + nb_outputs,
+      platform::errors::InvalidArgument("The pos(%d) should be less than the "
+                                        "num(%d) of the input and the output.",
+                                        pos, nb_inputs + nb_outputs));
+
+  const nvinfer1::PluginTensorDesc& in = in_out[pos];
+  if (pos == 0) {
+    if (with_fp16_) {
+      return (in.type == nvinfer1::DataType::kFLOAT ||
+              in.type == nvinfer1::DataType::kHALF) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    } else {
+      return (in.type == nvinfer1::DataType::kFLOAT) &&
+             (in.format == nvinfer1::TensorFormat::kLINEAR);
+    }
+  } else if (pos == 1) {
+    return in.type == nvinfer1::DataType::kINT32 &&
+           in.format == nvinfer1::TensorFormat::kLINEAR;
+  } else if (pos == 2) {
+    return in.type == in_out[0].type &&
+           in.format == nvinfer1::TensorFormat::kLINEAR;
+  }
+
+  return true;
+}
+
+nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
+  return input_types[0];
+}
+
+int GatherNdPluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc* input_desc,
+    const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  auto input_dims = input_desc[0].dims;
+  auto index_dims = input_desc[1].dims;
+  auto input_dims_size = input_dims.nbDims;
+  auto index_dims_size = index_dims.nbDims;
+
+  std::vector<int32_t> input_shape, index_shape, out_shape;
+  for (int i = 0; i < input_dims.nbDims; i++)
+    input_shape.push_back(input_dims.d[i]);
+  for (int i = 0; i < index_dims.nbDims; i++)
+    index_shape.push_back(index_dims.d[i]);
+
+  // The out_shape is
+  //   Index.shape[:-1] + X.shape[Index.shape[-1]:]
+  for (int i = 0; i < index_dims_size - 1; ++i) {
+    out_shape.emplace_back(index_shape[i]);
+  }
+  for (int i = index_shape[index_dims_size - 1]; i < input_dims_size; ++i) {
+    out_shape.emplace_back(input_shape[i]);
+  }
+
+  // final dim
+  int end_size = index_shape[index_dims_size - 1];
+  // remain dim
+  std::vector<int32_t> remain_ddim(index_shape.begin(), index_shape.end() - 1);
+  int remain_numel = std::accumulate(remain_ddim.begin(), remain_ddim.end(), 1,
+                                     std::multiplies<int32_t>());
+  // slice size
+  int slice_size = 1;
+  for (int i = end_size; i < input_dims_size; ++i) {
+    slice_size *= input_shape[i];
+  }
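+  // (Editorial example) With X of shape {2, 3, 4} and Index of shape {5, 2}:
+  // end_size = 2, remain_numel = 5, slice_size = 4, and the output shape is
+  // Index.shape[:-1] + X.shape[2:] = {5, 4}, i.e. five gathered slices of
+  // four elements each.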
gather_nd-->fp32"; + + const float* p_input = static_cast(inputs[0]); + const int32_t* p_index = static_cast(inputs[1]); + float* p_output = static_cast(outputs[0]); + + if (input_dims_data_ == nullptr) { + cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int)); + } + cudaMemcpyAsync(input_dims_data_, input_shape.data(), + sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice, + stream); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + GatherNdCUDAKernel<<>>( + p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size, + end_size); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp16"; + + const half* p_input = static_cast(inputs[0]); + const int32_t* p_index = static_cast(inputs[1]); + half* p_output = static_cast(outputs[0]); + + if (input_dims_data_ == nullptr) { + cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int)); + } + cudaMemcpyAsync(input_dims_data_, input_shape.data(), + sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice, + stream); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + GatherNdCUDAKernel<<>>( + p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size, + end_size); + } + + return cudaGetLastError() != cudaSuccess; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..0a242238c81fb3b34888905a393bc992179712b2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h @@ -0,0 +1,132 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
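Editorial aside: the plugin's only persistent state is the `with_fp16_` flag, so `getSerializationSize()`, `serialize()`, and the deserializing constructor merely need to agree on one field. A minimal standalone analogue of that contract (the `TinyPluginState` type is hypothetical; Paddle's `SerializedSize`/`SerializeValue`/`DeserializeValue` helpers generalize this pattern to arbitrary field sequences):

```cpp
#include <cstddef>
#include <cstring>

// Sketch of the serialize/deserialize contract the plugin relies on:
// SerializationSize() reports the byte count, Serialize() writes the fields
// in order, and Deserialize() reads them back in the same order.
struct TinyPluginState {
  bool with_fp16 = false;

  size_t SerializationSize() const { return sizeof(with_fp16); }

  void Serialize(void* buffer) const {
    std::memcpy(buffer, &with_fp16, sizeof(with_fp16));
  }

  static TinyPluginState Deserialize(const void* data, size_t length) {
    TinyPluginState s;
    if (length >= sizeof(s.with_fp16)) {
      std::memcpy(&s.with_fp16, data, sizeof(s.with_fp16));
    }
    return s;
  }
};
```

Round-tripping (Serialize into a buffer of SerializationSize() bytes, then Deserialize it) must reproduce the flag; this is what TensorRT exercises through the registered creator's deserializePlugin during engine load.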
diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a242238c81fb3b34888905a393bc992179712b2
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h
@@ -0,0 +1,132 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdio>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+#if IS_TRT_VERSION_GE(6000)
+class GatherNdPluginDynamic : public DynamicPluginTensorRT {
+ public:
+  explicit GatherNdPluginDynamic(bool with_fp16) { with_fp16_ = with_fp16; }
+
+  GatherNdPluginDynamic(void const* serial_data, size_t serial_length) {
+    DeserializeValue(&serial_data, &serial_length, &with_fp16_);
+  }
+
+  nvinfer1::IPluginV2DynamicExt* clone() const override {
+    return new GatherNdPluginDynamic(with_fp16_);
+  }
+
+  const char* getPluginType() const override { return "gather_nd_plugin"; }
+  int getNbOutputs() const override { return 1; }
+  int initialize() override;
+
+  size_t getSerializationSize() const override;
+  void serialize(void* buffer) const override;
+
+  nvinfer1::DimsExprs getOutputDimensions(
+      int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+      nvinfer1::IExprBuilder& exprBuilder) override;
+
+  bool supportsFormatCombination(int pos,
+                                 const nvinfer1::PluginTensorDesc* inOut,
+                                 int nbInputs, int nbOutputs) override;
+
+  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
+                       int nbInputs,
+                       const nvinfer1::DynamicPluginTensorDesc* out,
+                       int nbOutputs) override {}
+
+  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
+                          int nbInputs,
+                          const nvinfer1::PluginTensorDesc* outputs,
+                          int nbOutputs) const override {
+    return 0;
+  }
+
+  int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
+              const nvinfer1::PluginTensorDesc* outputDesc,
+              const void* const* inputs, void* const* outputs, void* workspace,
+              cudaStream_t stream) override;
+  nvinfer1::DataType getOutputDataType(int index,
+                                       const nvinfer1::DataType* inputTypes,
+                                       int nbInputs) const override;
+
+  void destroy() override {
+    if (input_dims_data_) {
+      cudaFree(input_dims_data_);
+    }
+    delete this;
+  }
+
+ private:
+  int32_t* input_dims_data_{nullptr};
+};
+
+class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator {
+ public:
+  GatherNdPluginDynamicCreator() {}
+  const char* getPluginName() const override { return "gather_nd_plugin"; }
+
+  const char* getPluginVersion() const override { return "1"; }
+
+  const nvinfer1::PluginFieldCollection* getFieldNames() override {
+    return &field_collection_;
+  }
+
+  nvinfer1::IPluginV2* createPlugin(
+      const char* name, const nvinfer1::PluginFieldCollection* fc) override {
+    return nullptr;
+  }
+
+  nvinfer1::IPluginV2* deserializePlugin(const char* name,
+                                         const void* serial_data,
+                                         size_t serial_length) override {
+    auto plugin = new GatherNdPluginDynamic(serial_data, serial_length);
+    return plugin;
+  }
+
+  void setPluginNamespace(const char* lib_namespace) override {
+    plugin_namespace_ = lib_namespace;
+  }
+
+  const char* getPluginNamespace() const override {
+    return plugin_namespace_.c_str();
+  }
+
+ private:
+  std::string plugin_namespace_;
+  std::string plugin_name_;
+  nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
+  std::vector<nvinfer1::PluginField> plugin_attributes_;
+};
+
+REGISTER_TRT_PLUGIN_V2(GatherNdPluginDynamicCreator);
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index a5f075b8dc68c26a034a667f5bdb2a26c224c24c..07208d016a79083079707e38dd0207b4d1c282a2 100644
---
a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -8,44 +8,84 @@ if(WITH_GPU AND TENSORRT_FOUND) set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps}) endif() -function(download_data install_dir data_file) +function(download_data install_dir data_file check_sum) string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) if (NOT EXISTS ${install_dir}/${file_name}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file} ${check_sum}) endif() endfunction() -function(download_int8_data install_dir data_file) +function(download_data_without_verify install_dir data_file) + string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) + if (NOT EXISTS ${install_dir}/${file_name}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL} ${data_file}) + endif() +endfunction() + +function(download_int8_data install_dir data_file check_sum) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) + endif() +endfunction() + +function(download_int8_data_without_verify install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + endif() +endfunction() + +function(download_bfloat16_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file} ${check_sum}) endif() endfunction() -function(download_bfloat16_data install_dir data_file) +function(download_bfloat16_data_without_verify install_dir data_file) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) endif() endfunction() -function(download_GRU_data install_dir data_file) +function(download_GRU_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file} ${check_sum}) endif() endfunction() -function(download_quant_data install_dir data_file) +function(download_GRU_data_without_verify install_dir data_file) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/gru ${data_file}) endif() endfunction() -function(download_model_and_data install_dir model_name data_name) - download_data(${install_dir} ${model_name}) - download_data(${install_dir} ${data_name}) +function(download_quant_data install_dir data_file check_sum) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum}) + endif() +endfunction() + +function(download_quant_data_without_verify install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + 
endif() +endfunction() + +function(download_model_and_data install_dir model_name model_check_sum data_name data_check_sum) + download_data(${install_dir} ${model_name} ${model_check_sum}) + download_data(${install_dir} ${data_name} ${data_check_sum}) +endfunction() + +function(download_model_and_data_without_verify install_dir model_name data_name) + download_data_without_verify(${install_dir} ${model_name}) + download_data_without_verify(${install_dir} ${data_name}) +endfunction() + +function(download_result install_dir result_name check_sum) + download_data(${install_dir} ${result_name} ${check_sum}) endfunction() -function(download_result install_dir result_name) - download_data(${install_dir} ${result_name}) +function(download_result_without_verify install_dir result_name) + download_data_without_verify(${install_dir} ${result_name}) endfunction() function(inference_analysis_api_test target install_dir filename) @@ -165,12 +205,12 @@ endfunction() if(NOT APPLE AND WITH_MKLML) # RNN1 set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") - download_model_and_data(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") + download_model_and_data_without_verify(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") - download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") + download_model_and_data_without_verify(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) @@ -193,7 +233,7 @@ endif() # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") -download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +download_model_and_data_without_verify(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) # TODO(luotao, Superjom) Disable DAM test, temporarily fix @@ -201,12 +241,12 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # After inference framework refactor, will reopen it. 
# normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") -download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") +download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") -download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") +download_model_and_data_without_verify(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) @@ -216,29 +256,29 @@ inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} an # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") -download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") +download_model_and_data_without_verify(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc) # lac set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") -download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5 "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd) inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) # Pyramid DNN set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") -download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") +download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) #Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") -download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" "Ernie_data.txt.tar.gz" "Ernie_result.txt.tar.gz") -download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz") +download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) +download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) #Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") -download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" "Ernie_large_data.txt.tar.gz" "Ernie_large_result.txt.tar.gz") -download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") +download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f) +download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73) inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model 
--infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) @@ -251,17 +291,17 @@ endif() # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" 3f0f440313ca50e26184e65ffd5809ab "text_classification_data.txt.tar.gz" 36ae620020cc3377f45ed330dd36238f) inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) # seq_conv1 set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") -download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +download_model_and_data_without_verify(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) # transformer, the dataset only works on batch_size=8 now set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") -download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") +download_model_and_data_without_verify(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 @@ -278,13 +318,13 @@ inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transfor # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) - inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz") + inference_download_and_uncompress_without_verify(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) # densebox set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox") -download_data(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") +download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") #inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc # EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} # ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt @@ -294,7 +334,7 @@ download_data(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") # mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) - inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") + inference_download_and_uncompress_without_verify(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") endif() inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) @@ -307,13 +347,13 @@ inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLA # googlenet set(GOOGLENET_MODEL_DIR 
"${INFERENCE_DEMO_INSTALL_DIR}/googlenet") -download_data(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") +download_data_without_verify(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP} ${GOOGLENET_MODEL_DIR} false) # resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") -download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") +download_data_without_verify(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} ${RESNET50_MODEL_DIR} true) if (WIN32) @@ -323,7 +363,7 @@ endif() # mobilenet with depthwise_conv op set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") -download_data(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") +download_data_without_verify(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP} ${MOBILENET_MODEL_DIR} false) @@ -340,7 +380,7 @@ if(WITH_MKLDNN) set(IMAGENET_DATA_ARCHIVE "imagenet_val_100_tail.tar.gz") set(IMAGENET_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/imagenet") set(IMAGENET_DATA_PATH "${IMAGENET_DATA_DIR}/data.bin") - download_int8_data(${IMAGENET_DATA_DIR} ${IMAGENET_DATA_ARCHIVE}) + download_int8_data_without_verify(${IMAGENET_DATA_DIR} ${IMAGENET_DATA_ARCHIVE}) # build test binary to be used in subsequent tests set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification") @@ -349,40 +389,40 @@ if(WITH_MKLDNN) # resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") - download_int8_data(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv1 int8 set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1") - download_int8_data(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv2 int8 set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2") - download_int8_data(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) # resnet101 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") - download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) # inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg16 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") - download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) # 
inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg19 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") - download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) # inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) # googlenet int8 set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") - download_int8_data(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) ### BFLOAT16 tests @@ -410,7 +450,7 @@ if(WITH_MKLDNN) set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc") # download dataset if necessary - download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") + download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") # build test binary to be used in subsequent tests @@ -418,13 +458,13 @@ if(WITH_MKLDNN) # mobilenet-ssd int8 set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") - download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") - download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") - download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") + download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz") + download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") @@ -451,9 +491,9 @@ if(WITH_MKLDNN) set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2") set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") if(NOT LINUX) - download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") + download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") endif(NOT LINUX) - download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") + download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) ### Other tests @@ -465,13 +505,13 @@ if(WITH_MKLDNN) inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP}) # preprocess data2bin imagenet - download_int8_data(${INT8_DATA_DIR} "imagenet_small.tar.gz") + download_int8_data_without_verify(${INT8_DATA_DIR} "imagenet_small.tar.gz") set(IMAGENET_SMALL_DATA_DIR 
"${INT8_DATA_DIR}/imagenet_small") set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin") preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE}) # preprocess data2bin pascalvoc - download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") + download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small") set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin") preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE}) @@ -480,26 +520,26 @@ endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") -download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") +download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") -download_data(${MMP_INSTALL_DIR} PaddleInference/mobilenet_v2_models.tar.gz) +download_data_without_verify(${MMP_INSTALL_DIR} PaddleInference/mobilenet_v2_models.tar.gz) inference_multiple_models_analysis_api_test(test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR} analyzer_mmp_tester.cc) if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) - inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz") + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz" 3dcccdc38b549b6b1b4089723757bd98) endif() set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) - inference_download_and_uncompress(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") + inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) - inference_download_and_uncompress(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") + inference_download_and_uncompress_without_verify(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -531,7 +571,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) - inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") + inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -539,7 +579,7 @@ if(WITH_GPU AND TENSORRT_FOUND) 
set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) - inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") + inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -547,12 +587,12 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) - inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") + inference_download_and_uncompress_without_verify(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") endif() set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) - inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz") + inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz" 2a5e8791e47b221b4f782151d76da9c6) endif() inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -560,7 +600,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test") if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz") + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz" 5fa371efa75706becbaad79195d2ca68) endif() inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc @@ -569,7 +609,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune") if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz) - inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz") + inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz" 77b56dc73ff0cf44ddb1ce9ca0b0f471) endif() inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc @@ -577,7 +617,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz" 833d73fc6a7f7e1ee4a1fd6419209e55) endif() inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc @@ -585,7 +625,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) if (NOT EXISTS 
${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz") + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz" c5ff2d0cad79953ffbf2b8b9e2fae6e4) endif() inference_analysis_test(test_trt_dynamic_shape_ernie_fp16_ser_deser SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc @@ -595,7 +635,7 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") -download_data(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz") +download_data_without_verify(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz") inference_analysis_test(lite_mul_model_test SRCS lite_mul_model_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 41b78d39a2594cbe39bc0d0defef7a24047674dc..05c468b798886ac135ed30bff75ce9400f1ca3a1 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -23,7 +23,30 @@ function(inference_download INSTALL_DIR URL FILENAME) ) endfunction() -function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) +function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) + string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) + set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}") + set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") + ExternalProject_Add( + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + URL_HASH MD5=${CHECK_SUM} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} + ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME} + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) +endfunction() + +function(inference_download_and_uncompress_without_verify INSTALL_DIR URL FILENAME) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) @@ -47,13 +70,13 @@ endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) - inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") + inference_download_and_uncompress_without_verify(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz) - inference_download_and_uncompress(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") + inference_download_and_uncompress_without_verify(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") diff --git a/paddle/fluid/operators/CMakeLists.txt 
b/paddle/fluid/operators/CMakeLists.txt
index 6e11c64afc4bd813362640e151203d4dd700fea5..e645b379f3c06ae2e83c93b6f1a4d56f57f99d57 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -115,9 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
 if (WITH_GPU OR WITH_ROCM)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
@@ -171,7 +171,7 @@ endif()
 if (WITH_ASCEND_CL)
     cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
-    cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op)
+    cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_function scope device_context enforce executor compare_op)
 endif()

 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu
index d03de7a45628a4cd4045e6fbc2965060b3486cfe..b0eba229fde51841542b5d8d1d73330b40bd29f0 100644
--- a/paddle/fluid/operators/abs_op.cu
+++ b/paddle/fluid/operators/abs_op.cu
@@ -14,8 +14,6 @@

 #include "paddle/fluid/operators/abs_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
-#include "paddle/fluid/platform/complex128.h"
-#include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"

 namespace paddle {
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 055909ba6f486ff82220c2d36c54687091bde9ed..47618114a85ff1cd4f9455793ca4c63afe260558 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
   }
 };

+template <typename T>
+class SigmoidDoubleGradMaker
+    : public ::paddle::framework::SingleGradOpMaker<T> {
+ public:
+  using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("sigmoid_grad_grad");
+    // input1: Out
+    op->SetInput("Out", this->Input("Out"));
+    // input2: ddx
+    op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
+    op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
+    op->SetAttrMap(this->Attrs());
+    // output: ddy
+    op->SetOutput("DOutNew", this->InputGrad("Out"));
+    op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
+  }
+};
+
 template <typename T>
 class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
  public:
@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform;

 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);

+/* ==========================    sigmoid register  =============================
+ */
+// 1. Register Sigmoid Operator
+REGISTER_OPERATOR(
+    sigmoid, ops::ActivationOp, ops::SigmoidOpMaker,
+    ops::ActivationOpInferVarType,
+    ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
+                               paddle::framework::OpDesc>,
+    ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
+                               paddle::imperative::OpBase>,
+    std::conditional<ops::CanInplaceAct<ops::SigmoidGradFunctor<float>>(),
+                     ops::ActFwdInplaceInferer, void>::type);
+
+// 2. Register Sigmoid Grad Operator
+REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad,
+                  ops::ActivationGradOpInplaceInferer,
+                  ops::SigmoidDoubleGradMaker<paddle::framework::OpDesc>,
+                  ops::SigmoidDoubleGradMaker<paddle::imperative::OpBase>);
+
+// 3. Register Sigmoid DoubleGrad Operator
+REGISTER_OPERATOR(
+    sigmoid_grad_grad,
+    ops::ActivationOpDoubleGrad<ops::SigmoidGradGradFunctor<float>::FwdDeps()>,
+    ops::ActivationDoubleGradOpInplaceInferer);
+
+// Register Sigmoid/GradSigmoid Kernels
+REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
+                               SigmoidGradFunctor);
+
+// Register DoubleGrad Kernel
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_grad_grad,
+    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
+                                 ops::SigmoidGradGradFunctor<float>>,
+    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
+                                 ops::SigmoidGradGradFunctor<double>>,
+    ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
+                                 ops::SigmoidGradGradFunctor<plat::float16>>);
+
+/* ========================================================================== */
+
 /* ==========================    tanh register  ============================= */
 REGISTER_OPERATOR(
     tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType,
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index 87e65e8817798199c907f88692887a21da58673c..c94510c9dfe5235187565a4178216503fcef6275 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL(
 #endif
 /* ========================================================================== */

+/* ===========================   sigmoid register  ============================
+ */
+REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
+                                CudaSigmoidGradFunctor);
+
+REGISTER_OP_CUDA_KERNEL(
+    sigmoid_grad_grad,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<float>>,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<double>>,
+    ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
+                                 ops::SigmoidGradGradFunctor<plat::float16>>);
+/* ========================================================================== */
+
 /* ===========================    tanh register  ============================ */
 REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
                                 CudaTanhGradFunctor);
@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL(
 /* ========================================================================== */

 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
-  __macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor);      \
   __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor);                  \
   __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor,                      \
           CudaLogSigmoidGradFunctor);                                         \
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index ccd5bf528ba58ca731513a1a1fafce3f2f64c470..3bdf3f34721b039b1a64794c1b9932e1c7d8e834 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };

+/*
+    Out
+    DOut -> SigmoidGradGrad -> DOutNew
+    DDX                        DDOut
+
+    DDOut = (1-Out)*Out*DDX
+    DOutNew = (1-2*Out)*DOut*DDX
+*/
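+// (Editorial derivation) Write y = Out = sigmoid(x). The first-order backward
+// pass computes DX = DOut * y * (1 - y). Differentiating DX with respect to
+// its two inputs and contracting with DDX yields exactly the two formulas
+// above:
+//   d(DX)/d(DOut) * DDX = y * (1 - y) * DDX      = DDOut
+//   d(DX)/d(y)    * DDX = (1 - 2y) * DOut * DDX  = DOutNew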
+template <typename T>
+struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device>
+  void operator()(const Device& dev, const framework::Tensor* Out,
+                  const framework::Tensor* ddX, const framework::Tensor* dOut,
+                  framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
+    auto* d = dev.eigen_device();
+    auto ddx = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
+    auto out = framework::EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
+
+    if (dOutNew) {
+      auto dout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
+      auto dout_new = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
+      dout_new.device(*d) =
+          (static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
+    }
+    if (ddOut) {
+      auto ddout = framework::EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
+      ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
+    }
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+
 // silu(x) = x / (1 + exp(-x))
 template <typename T>
 struct SiluFunctor : public BaseActivationFunctor<T> {
@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut(
   }
 }

+template <typename DeviceContext, typename Functor>
+class SigmoidDoubleGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::Tensor *Out, *ddX, *dOut;
+    framework::Tensor *dOutNew, *ddOut;
+    Out = ddX = dOut = nullptr;
+    dOutNew = ddOut = nullptr;
+
+    // extract ddx(input) and out(input)
+    ddX = ctx.Input<framework::Tensor>("DDX");
+    Out = ctx.Input<framework::Tensor>("Out");
+    PADDLE_ENFORCE_NOT_NULL(
+        ddX, platform::errors::NotFound(
+                 "Cannot get input Variable ddX, variable name = %s",
+                 ctx.InputName("DDX")));
+    PADDLE_ENFORCE_NOT_NULL(
+        Out, platform::errors::NotFound(
+                 "Cannot get input Variable Out, variable name = %s",
+                 ctx.InputName("Out")));
+
+    // set output ddout
+    ddOut = ctx.Output<framework::Tensor>("DDOut");
+
+    // extract dOut(input)
+    dOut = ctx.Input<framework::Tensor>("DOut");
+    PADDLE_ENFORCE_NOT_NULL(
+        dOut, platform::errors::NotFound(
+                  "Cannot get input Variable dOut, variable name = %s",
+                  ctx.InputName("DOut")));
+
+    // set output dout_new
+    dOutNew = ctx.Output<framework::Tensor>("DOutNew");
+
+    if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
+    if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
+    auto& place = ctx.template device_context<DeviceContext>();
+    Functor functor;
+    functor(place, Out, ddX, dOut, dOutNew, ddOut);
+  }
+};
+
 template <typename DeviceContext, typename Functor>
 class TanhDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
 }  // namespace paddle

 #define FOR_EACH_ACTIVATION_OP(__macro)                                       \
-  __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
   __macro(silu, Silu, SiluFunctor, SiluGradFunctor);                          \
   __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
   __macro(atan, Atan, AtanFunctor, AtanGradFunctor);                          \
diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc
index f368c658230555c5a3529b39dfc1b60b1cab56e4..cb3d85c1368bc4ffacf20aa24fa2722b56925186 100644
--- a/paddle/fluid/operators/activation_op_npu.cc
+++ b/paddle/fluid/operators/activation_op_npu.cc
@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> {

     out->mutable_data<T>(ctx.GetPlace());

-    auto runner = NpuOpRunner("Power", {*x}, {*out},
-                              {{"power", factor},
-                               {"scale", static_cast<float>(1.0)},
-                               {"shift", static_cast<float>(0.0)}});
+    const auto& runner = NpuOpRunner("Power", {*x}, {*out},
+                                     {{"power", factor},
+                                      {"scale", static_cast<float>(1.0)},
+                                      {"shift", static_cast<float>(0.0)}});

     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
     // Step1: Compute x_pow =
x.pow(factor-1) Tensor x_pow(x->type()); x_pow.mutable_data(x->dims(), place); - auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, - {{"power", factor - static_cast(1)}}); + const auto& runner_pow = NpuOpRunner( + "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); runner_pow.Run(stream); // Step 2: Construct a broadcast factor, which has the same shape with x. @@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel { // factor. Tensor factor_bc_tensor(framework::proto::VarType::FP32); factor_bc_tensor.mutable_data(x_dims, place); - auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, - {{"dims", framework::vectorize(x_dims)}}); + const auto& runner_bc = + NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); runner_bc.Run(stream); // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) Tensor x_power_mul_factor(x->type()); x_power_mul_factor.mutable_data(x->dims(), place); - auto runner_mul_1 = + const auto& runner_mul_1 = NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); runner_mul_1.Run(stream); // Step 4: Compute dx = dout * factor * x.pow(factor-1) dx->mutable_data(place); - auto runner_mul_2 = + const auto& runner_mul_2 = NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); runner_mul_2.Run(stream); } @@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Relu", - { - *x, - }, - {*out}, {}); + const auto& runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); auto stream = ctx.template device_context() @@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel { .stream(); dx->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); runner.Run(stream); } @@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); runner.Run(stream); } }; @@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); - dx_runner.Run(stream); + const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + runner_dx.Run(stream); } }; @@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel { Tensor one(x->type()); one.mutable_data(x->dims(), place); - auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); - one_runner.Run(stream); + const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); + runner_one.Run(stream); Tensor sub(x->type()); sub.mutable_data(x->dims(), place); - auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); - sub_runner.Run(stream); + const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + runner_sub.Run(stream); - auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); - out_runner.Run(stream); + const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {}); + runner_out.Run(stream); } }; @@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); runner.Run(stream); } }; @@ 
-254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); runner.Run(stream); } }; @@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); - dx_runner.Run(stream); + const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + runner_dx.Run(stream); } }; @@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index fe5b08af52a624b29100635ee34cfac7c2d2a859..82436bdef16bcf59baeac2054f3cce3fd9a54047 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel { auto* float_status = ctx.Output("FloatStatus"); float_status->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + const auto& runner = + NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 8fd45326e4ec6134cf4b98be12212ce8d7d74541..26280cd2bd1d32fedaa01d0b638fdcc89749bb76 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { found_inf->mutable_data(ctx.GetPlace()); - bool found_inf_data = false; - auto stream = ctx.template device_context() .stream(); - // step1: inverse scale(RealDiv) + // step1: inverse scale Tensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); @@ -58,7 +56,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { Tensor inverse_out(scale->type()); inverse_out.Resize(scale->dims()); inverse_out.mutable_data(ctx.GetPlace()); - auto runner_inverse = + const auto& runner_inverse = NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); runner_inverse.Run(stream); tmp_inverse_out = &inverse_out; @@ -66,55 +64,41 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // tmp is only placeholder. 
- auto runner_float_status = + const auto& runner_float_status = NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, {{"message", std::string("check_nan_and_inf")}}); runner_float_status.Run(stream); Tensor sum; sum.mutable_data({1}, ctx.GetPlace()); - auto runner_reduce_sum = + const auto& runner_reduce_sum = NpuOpRunner("ReduceSumD", {*float_status}, {sum}, {{"axes", std::vector{0}}, {"keep_dims", true}}); runner_reduce_sum.Run(stream); - std::vector sum_vec; - TensorToVector( - sum, ctx.template device_context(), - &sum_vec); - found_inf_data = (sum_vec[0] > 1); - - VLOG(4) << "found_inf_data:" << found_inf_data; - + const auto& runner_greater = + NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {}); + runner_greater.Run(stream); + + // NOTE(zhiqiu): The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // However, on NPU, in order to avoid stream sync, we do not copy the + // found_inf data to cpu to check whether to unscale or not. + // Instead, we do the Mul no matter found_inf or not. + // And, a fact is, only few steps contains nan/inf during training. for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - if (!found_inf_data) { - // MatMul - auto runner_matmul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_matmul.Run(stream); - } + const auto& runner_mul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_mul.Run(stream); } - // set found_inf to true - VLOG(4) << "found overflow:" << found_inf_data; - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = found_inf_data; - - framework::TensorCopy( - found_inf_tensor, ctx.GetPlace(), - ctx.template device_context(), found_inf); - ctx.template device_context().Wait(); - - auto runner_clear_status = + const auto& runner_clear_status = NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); runner_clear_status.Run(stream); } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 820966addfcff31d1676aedd71101a2e3c5a4332..6db18c46a09b85e08ffecc14ce86f8f20bb7713e 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx, Tensor factor_tensor(bad_out_tensor->type()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, - {*bad_out_tensor}, {}); + const auto& runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); runner_p2.Run(stream); std::vector bad_out_data; TensorToVector(*bad_out_tensor, ctx, &bad_out_data); if (bad_out_data[0] == decr_every_n_nan_or_inf) { - auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", decr_ratio}, - {"shift", static_cast(0)}}); + const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); runner_p3.Run(stream); @@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx, TensorToVector(*updated_loss_scaling_tensor, ctx, 
&new_loss_scaling); if (new_loss_scaling[0] < static_cast(1)) { // updated_loss_scaling_data = 1 - auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(1)}}); + const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); runner_p4.Run(stream); } @@ -86,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx, Tensor factor_tensor(good_out_tensor->type()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, - {*good_out_tensor}, {}); + const auto& runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); runner_p2.Run(stream); std::vector good_out_data; TensorToVector(*good_out_tensor, ctx, &good_out_data); if (good_out_data[0] == incr_every_n_steps) { - auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", incr_ratio}, - {"shift", static_cast(0)}}); + const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); runner_p3.Run(stream); std::vector new_loss_scaling; TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); if (!std::isfinite(new_loss_scaling[0])) { // updated_loss_scaling_data = pre_loss_scaling_data - auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(1)}, - {"shift", static_cast(0)}}); + const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); runner_p4.Run(stream); } @@ -165,7 +165,7 @@ class LazyZerosNPU { } zero_tensor->mutable_data(place); - auto runner_zeros = + const auto& runner_zeros = NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); runner_zeros.Run(stream); zero_tensor->check_memory_size(); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc index 93689d5e495f33484d2f05b04d25734a8c5ab07e..4f4b7d544a0d8b44453a62b461cf52802aac83d2 100644 --- a/paddle/fluid/operators/assign_op_npu.cc +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt index 54008336a9f67f0123ba1cfa6fcea35b79b7ac4c..e5023d8eb354aedd221d9b4e86963a5b8d30390b 100644 --- a/paddle/fluid/operators/benchmark/CMakeLists.txt +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -1,3 +1,3 @@ cc_test(op_tester SRCS op_tester.cc op_tester_config.cc DEPS memory timer framework_proto proto_desc lod_tensor op_registry - device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function) diff 
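Update() above is standard dynamic loss scaling: a run of clean steps grows the scale by incr_ratio, a run of overflow steps shrinks it by decr_ratio, with the result clamped to at least 1 on the way down and rejected if it overflows on the way up. A scalar sketch of that rule (counter resets assumed from the surrounding code; the helper name is illustrative):

    #include <cmath>

    float update_loss_scaling(float scale, bool found_inf, int* good, int* bad,
                              int incr_every_n_steps, int decr_every_n_nan_or_inf,
                              float incr_ratio, float decr_ratio) {
      if (found_inf) {
        *good = 0;
        if (++(*bad) == decr_every_n_nan_or_inf) {
          *bad = 0;
          scale *= decr_ratio;                      // runner_p3, decrease branch
          if (scale < 1.0f) scale = 1.0f;           // runner_p4: clamp at 1
        }
      } else {
        *bad = 0;
        if (++(*good) == incr_every_n_steps) {
          *good = 0;
          float grown = scale * incr_ratio;         // runner_p3, increase branch
          if (std::isfinite(grown)) scale = grown;  // runner_p4: keep old scale on inf/nan
        }
      }
      return scale;
    }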
--git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 0de0f5e4505795f69f1d80e2bbc1600250fc7391..4efaecbe9a5b809192c50fd6341577f04bd1b247 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Cast", {*x}, {*out}, - {{"dst_type", static_cast(aclDtype)}}); + const auto& runner = NpuOpRunner( + "Cast", {*x}, {*out}, {{"dst_type", static_cast(aclDtype)}}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..094ef9c8d4ef6e58e4ad639ffbf32b5ea2e68561 --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -0,0 +1,150 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_embedding_op.h" + +namespace paddle { +namespace operators { + +class CEmbeddingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CEmbeddingOp"); + OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "CEmbeddingOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CEmbeddingOp"); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ( + table_dims.size(), 2, + platform::errors::InvalidArgument( + "ShapeError: The dimensions of the 'c_embedding' must be 2. 
" + "But received c_embedding's dimensions = %d, " + "c_embedding's shape = [%s].", + table_dims.size(), table_dims)); + + auto output_dims = framework::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "W"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int64 " + "contains the ids to be looked up in W."); + AddOutput("Out", "The lookup results, which have the same type as W."); + + AddAttr("start_index", + "(int64, default 0), The starting index is indeed, " + "and the out-of-bounds will be set to 0 ") + .SetDefault(0); + AddComment(R"DOC( +c_embedding Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(CEmbeddingGradOpNoBufferVarsInferer, "W"); + +template +class CEmbeddingGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("c_embedding_grad"); + + op->SetInput("W", this->Input("W")); + op->SetInput("Ids", this->Input("Ids")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); + + op->SetAttrMap(this->Attrs()); + } +}; + +class CEmbeddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = framework::GradVarName("W"); + VLOG(3) << "c_embedding_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker, + ops::CEmbeddingGradOpMaker, + ops::CEmbeddingGradOpMaker); + +REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad, + ops::CEmbeddingGradOpNoBufferVarsInferer, + ops::CEmbeddingOpGradVarTypeInference); + 
+REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel, + ops::CEmbeddingOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ecf3887eef4ac6a8af7538789ec5fc56691b83bb --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_embedding_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void CEmbedding(T *out, const T *table, const IndexT *ids, + const int rows, const int columns, const int64_t N, + const int64_t start_idx, const int64_t end_idx, + const int64_t limit) { + CUDA_KERNEL_LOOP(i, limit) { + size_t row = i / columns; + size_t col = i % columns; + auto id = ids[row]; + + if (id >= start_idx && id < end_idx) { + auto real_idx = id - start_idx; + PADDLE_ENFORCE(real_idx < N, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be less than [%d], but received [%d]", + N, real_idx); + out[i] = table[real_idx * columns + col]; + } else { + out[i] = static_cast(0); + } + } +} + +template +__global__ void CEmbeddingGrad(T *table, const T *output, const IndexT *ids, + const int rows, const int columns, + const int64_t N, const int64_t start_idx, + const int64_t end_idx, const int64_t limit) { + CUDA_KERNEL_LOOP(i, limit) { + size_t row = i / columns; + size_t col = i % columns; + auto id = ids[row]; + if (id >= start_idx && id < end_idx) { + auto real_idx = id - start_idx; + paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col], + output[i]); + } + } +} + +template +class CEmbeddingCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); + + const auto &dev_ctx = + context.template device_context(); + const int64_t start_idx = context.Attr("start_index"); + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + const int64_t end_idx = start_idx + N; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto limit = K * D; + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + const auto &index_type = ids_t->type(); + if (index_type == framework::proto::VarType::INT32) { + CEmbedding<<>>( + output, table, ids_t->data(), K, D, N, start_idx, end_idx, + limit); + + } else if (index_type == framework::proto::VarType::INT64) { + CEmbedding<<>>( + output, table, ids_t->data(), K, D, N, start_idx, end_idx, + limit); + } + } +}; + +template +class CEmbeddingGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const auto &dev_ctx = + context.template device_context(); + const int64_t start_idx = context.Attr("start_index"); + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + + const int64_t end_idx = start_idx + N; + auto limit = K * D; + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + const auto &index_type = ids_t->type(); + if (index_type == framework::proto::VarType::INT32) { + CEmbeddingGrad<<>>( + d_table, d_output, ids_t->data(), K, D, N, start_idx, + end_idx, limit); + } else if (index_type == framework::proto::VarType::INT64) { + CEmbeddingGrad<<>>( + d_table, d_output, ids_t->data(), K, D, N, start_idx, + end_idx, limit); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(c_embedding, ops::CEmbeddingCUDAKernel, + ops::CEmbeddingCUDAKernel, + ops::CEmbeddingCUDAKernel); +REGISTER_OP_CUDA_KERNEL(c_embedding_grad, ops::CEmbeddingGradCUDAKernel, + ops::CEmbeddingGradCUDAKernel, + ops::CEmbeddingGradCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_embedding_op.h b/paddle/fluid/operators/collective/c_embedding_op.h new file mode 100644 index 
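Two details of the CUDA kernels above deserve a note: NumBlocks caps the grid at 4096 blocks, which is safe only because CUDA_KERNEL_LOOP is a grid-stride loop, and the backward kernel zero-fills the weight gradient (the Eigen constant assignment) before scattering with CudaAtomicAdd, since multiple ids can land on the same table row. A sketch of the loop shape being assumed:

    // Grid-stride pattern behind CUDA_KERNEL_LOOP (sketch): a capped grid still
    // covers all `limit` elements because each thread advances by the whole grid.
    __global__ void grid_stride_example(float* data, int limit) {
      for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < limit;
           i += blockDim.x * gridDim.x) {
        data[i] += 1.0f;  // stand-in for the per-element body
      }
    }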
0000000000000000000000000000000000000000..3cab6d7184441df4c87382904e7a1d35caddfbca --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +class CEmbeddingOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_embedding for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc index 03046d571d0f0542ff714868205d5a0aa285e685..37ec989f3f981227e37deb277c32301926723ed5 100644 --- a/paddle/fluid/operators/collective/c_split_op.cc +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -45,6 +45,12 @@ class CSplitOp : public framework::OperatorWithKernel { rank, nranks)); framework::DDim dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ( + dim[dim.size() - 1] % nranks, 0, + platform::errors::InvalidArgument("The last dimension (%d) of the X " + "should be divisible by nranks (%d)", + dim[dim.size() - 1], nranks)); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; if (dim[0] < 0) dim[0] = -1; ctx->SetOutputDim("Out", dim); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu similarity index 65% rename from paddle/fluid/operators/collective/c_split_op.cu.cc rename to paddle/fluid/operators/collective/c_split_op.cu index 92a7f5e41b1d2d8a1e3f4582ad014f630010c8ca..034accbb480c78be767e5b2900ccc376cfa5f635 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu.cc +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,10 +16,38 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void SplitFromRank(const T* input, T* output, const int rows, + const int columns, const int rank, + const int nranks, const int limit) { + CUDA_KERNEL_LOOP(i, limit) { + int row = i / columns; + int col = i % columns; + + int block = columns / nranks; + int start = block * rank; + int end = start + block; + + if (col >= start && col < end) { + int idx = block * row + col % block; + output[idx] = input[i]; + } + } +} + template class CSplitOpCUDAKernel : public framework::OpKernel { public: @@ -47,24 +75,25 @@ class CSplitOpCUDAKernel : public framework::OpKernel { rank, nranks)); auto& dev_ctx = ctx.template device_context(); - std::vector shape_refer; - std::vector results; - size_t numel = x->numel(); auto dims = x->dims(); - numel /= nranks; - int axis = dims.size() - 1; - dims[dims.size() - 1] /= nranks; - for (int i = 0; i < nranks; i++) { - framework::Tensor* out = new framework::Tensor(); - out->mutable_data(dims, place); - shape_refer.emplace_back(out); - results.emplace_back(out); - } + auto dims_size = dims.size(); + // final dim + int64_t end_size = dims[dims_size - 1]; - math::SplitFunctor functor; - functor(dev_ctx, *x, shape_refer, axis, &results); + // remain dim + auto remain_ddim = framework::slice_ddim(dims, 0, dims_size - 1); + int64_t remain_numel = framework::product(remain_ddim); + + int limit = x->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + dims[dims_size - 1] /= nranks; out->mutable_data(dims, place); - paddle::framework::TensorCopySync(*results[rank], out->place(), out); + + SplitFromRank<<>>( + x->data(), out->data(), remain_numel, end_size, rank, nranks, + limit); } }; } // namespace operators diff --git a/paddle/fluid/operators/compat/batch_norm.pbtxt b/paddle/fluid/operators/compat/batch_norm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..c18b4dc19dc2e70239c8e36d40fdb66c7d384434 --- /dev/null +++ b/paddle/fluid/operators/compat/batch_norm.pbtxt @@ -0,0 +1,94 @@ +type: "batch_norm" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + inputs { + name: "Mean" + } + inputs { + name: "Variance" + } + outputs { + name: "Y" + } + attrs { + name: "epsilon" + type: FLOAT + } +} +extra { + inputs { + name: "MomentumTensor" + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "momentum" + type: FLOAT + } + attrs { + name: "data_layout" + type: STRING + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "fuse_with_relu" + type: BOOLEAN + } + attrs { + name: "use_global_stats" + type: BOOLEAN + } + attrs { + name: "trainable_statistics" + type: BOOLEAN + } + outputs { + name: "MeanOut" + } + outputs { + name: "VarianceOut" + } + outputs { + name: "SavedMean" + } + outputs { + name: "SavedVariance" + } + outputs { + name: "ReserveSpace" + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + 
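The c_split rewrite above drops the SplitFunctor path, which materialized all nranks slices and then copied out the local one with TensorCopySync, in favor of a single kernel that writes only this rank's columns. With block = columns / nranks (the new InferShape check guarantees divisibility), rank r keeps the column window [r*block, (r+1)*block). A host-side reference of the same index mapping, as a sketch:

    void split_from_rank_reference(const float* in, float* out, int rows,
                                   int columns, int rank, int nranks) {
      const int block = columns / nranks;   // per-rank slice width
      const int start = block * rank;
      for (int r = 0; r < rows; ++r)
        for (int c = 0; c < block; ++c)
          out[r * block + c] = in[r * columns + start + c];  // matches SplitFromRank
    }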
type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/concat.pbtxt b/paddle/fluid/operators/compat/concat.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..54c8e089829eb1910820c363f422cecea3af45c5 --- /dev/null +++ b/paddle/fluid/operators/compat/concat.pbtxt @@ -0,0 +1,50 @@ +type: "concat" +def { + inputs { + name: "X" + } + inputs { + name: "AxisTensor" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..94073800f72461abfab7172dace7afeba9f19c09 --- /dev/null +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -0,0 +1,149 @@ +type: "conv2d" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + outputs { + name: "Output" + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } +} +extra { + inputs { + name: "ResidualData" + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "fuse_relu_before_depthwise_conv" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_brelu" + type: BOOLEAN + } + attrs { + name: "fuse_brelu_threshold" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "use_addto" + type: BOOLEAN + } + attrs { + name: "fuse_residual_connection" + type: BOOLEAN + } + attrs { + name: "Scale_in" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "Scale_in_eltwise" + type: FLOAT + } + attrs { + name: "Scale_weights" + type: FLOATS + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "workspace_size_MB" + type: INT + } + attrs { + name: "exhaustive_search" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..7e3ecb22152b561bbda8d7e108df98d39169818e --- /dev/null +++ b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt @@ -0,0 +1,110 @@ +type: "conv2d_transpose" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + outputs { + name: "Output" + } + attrs { +
name: "output_padding" + type: INTS + } + attrs { + name: "output_size" + type: INTS + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "workspace_size_MB" + type: INT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/elementwise_add.pbtxt b/paddle/fluid/operators/compat/elementwise_add.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..3e96147ef88ebba13e9c26e4ef164bc2e9231358 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_add.pbtxt @@ -0,0 +1,73 @@ +type: "elementwise_add" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + # no longer to use + } + attrs { + name: "y_data_format" + type: STRING + # no longer to use + } + attrs { + name: "use_quantizer" + type: BOOLEAN + # no longer to use, Use 'mkldnn_data_type' instead. 
+ } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_div.pbtxt b/paddle/fluid/operators/compat/elementwise_div.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..40e9d90dbfd89a9deefe4c6d6bec0cc432b63c13 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_div.pbtxt @@ -0,0 +1,74 @@ +type: "elementwise_div" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_pow.pbtxt b/paddle/fluid/operators/compat/elementwise_pow.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..3ad21423e40aba18ef99ff9959a601c1011fd4b3 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_pow.pbtxt @@ -0,0 +1,74 @@ +type: "elementwise_pow" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_sub.pbtxt b/paddle/fluid/operators/compat/elementwise_sub.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..b449e76ca06443a8adf65d0691654ec39c9c14f5 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_sub.pbtxt @@ -0,0 +1,74 @@ +type: "elementwise_sub" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: 
"use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..22954c9ba22ce4d4a88deca4f7e8b4a559971a78 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt @@ -0,0 +1,46 @@ +type: "fake_channel_wise_quantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "quant_axis" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt b/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..0a55c0e44862ce8aba6fbe07dfad73382266c426 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt @@ -0,0 +1,38 @@ +type: "fake_dequantize_max_abs" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + outputs { + name: "Out" + } + attrs { + name: "max_range" + type: FLOAT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..92ee54eb94c0e1da2d2069f722fded5c5b9ba66d --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt @@ -0,0 +1,38 @@ +type: "fake_quantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..dddb58f827ea036133649c5fb8a79869ed20f38b --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt @@ -0,0 +1,61 @@ +type: "fake_quantize_moving_average_abs_max" +def { + inputs { + name: "X" + } + inputs { + name: "InScale" + } + inputs { + name: "InAccum" + } + inputs { + 
name: "InState" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + outputs { + name: "OutState" + } + outputs { + name: "OutAccum" + } + attrs { + name: "moving_rate" + type: FLOAT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1050b724ee6b44e44945309b06c6bde6cda18631 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt @@ -0,0 +1,55 @@ +type: "fake_quantize_range_abs_max" +def { + inputs { + name: "X" + } + inputs { + name: "InScale" + } + inputs { + name: "Iter" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + outputs { + name: "OutScales" + } + attrs { + name: "window_size" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fc.pbtxt b/paddle/fluid/operators/compat/fc.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..55e1a22ce4da5f936487b0d2517ec2c76f0f8e5b --- /dev/null +++ b/paddle/fluid/operators/compat/fc.pbtxt @@ -0,0 +1,97 @@ +type: "fc" +def { + inputs { + name: "Input" + } + inputs { + name: "W" + } + inputs { + name: "Bias" + } + outputs { + name: "Out" + } + attrs { + name: "in_num_col_dims" + type: INT + } + attrs { + name: "activation_type" + type: STRING + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "padding_weights" + type: BOOLEAN + } + attrs { + name: "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "weight_scale" + type: FLOATS + } + attrs { + name: "Input_scale" + type: FLOAT + } + attrs { + name: "out_scale" + type: FLOAT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "enable_int8" + type: BOOLEAN + } + attrs { + name: "use_fc_padding" + type: BOOLEAN + } + attrs { + name: "use_gpu" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fill_constant.pbtxt b/paddle/fluid/operators/compat/fill_constant.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..b525da04a0d88b621bb8fe11ea4ecf5929921822 --- /dev/null +++ b/paddle/fluid/operators/compat/fill_constant.pbtxt @@ -0,0 +1,61 @@ +type: "fill_constant" +def { + inputs { + name: "ValueTensor" + } + inputs { + name: "ShapeTensor" + } + inputs { + name: "ShapeTensorList" + } + outputs { + name: "Out" + } + 
attrs { + name: "dtype" + type: INT + } + attrs { + name: "shape" + type: LONGS + } + attrs { + name: "value" + type: FLOAT + } + attrs { + name: "str_value" + type: STRING + } +} +extra { + attrs { + name: "force_cpu" + type: BOOLEAN + } + attrs { + name: "place_type" + type: INT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/hard_swish.pbtxt b/paddle/fluid/operators/compat/hard_swish.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..ccf387652ed32569aa35fe6bf7a5d155c2364b98 --- /dev/null +++ b/paddle/fluid/operators/compat/hard_swish.pbtxt @@ -0,0 +1,44 @@ +type: "hard_swish" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "threshold" + type: FLOAT + } + attrs { + name: "scale" + type: FLOAT + } + attrs { + name: "offset" + type: FLOAT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/leaky_relu.pbtxt b/paddle/fluid/operators/compat/leaky_relu.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..9df2e5916118c534530c0c7d0a12b3dabe0a1cb9 --- /dev/null +++ b/paddle/fluid/operators/compat/leaky_relu.pbtxt @@ -0,0 +1,40 @@ +type: "leaky_relu" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "alpha" + type: FLOAT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/mul.pbtxt b/paddle/fluid/operators/compat/mul.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..b40c05ad2e0333356163f2b9ba7b8890037d4bd9 --- /dev/null +++ b/paddle/fluid/operators/compat/mul.pbtxt @@ -0,0 +1,87 @@ +type: "mul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "x_num_col_dims" + type: INT + } + attrs { + name: "y_num_col_dims" + type: INT + } +} +extra { + attrs { + name: "skip_quant" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "scale_x" + type: FLOAT + } + attrs { + name: "scale_y" + type: FLOATS + } + attrs { + name: "scale_out" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "enable_int8" + type: BOOLEAN + } + attrs { + name: "X_scale" + type: FLOAT + } + attrs { + name: "weight_scale" + type: FLOAT + } + attrs { + name: "out_scale" + type: FLOAT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + +} diff --git a/paddle/fluid/operators/compat/reduce_mean.pbtxt b/paddle/fluid/operators/compat/reduce_mean.pbtxt new file mode 100644 index 
0000000000000000000000000000000000000000..eea6ad127fd4520b30ca8dc7222fca425ba399da --- /dev/null +++ b/paddle/fluid/operators/compat/reduce_mean.pbtxt @@ -0,0 +1,55 @@ +type: "reduce_mean" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "dim" + type: INTS + } + attrs { + name: "keep_dim" + type: BOOLEAN + } +} +extra { + attrs { + name: "reduce_all" + type: BOOLEAN + } + attrs { + name: "in_dtype" + type: INT + } + attrs { + name: "out_dtype" + type: INT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/relu.pbtxt b/paddle/fluid/operators/compat/relu.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..359bd70c2a310c0ea64da383c416482dfd28403e --- /dev/null +++ b/paddle/fluid/operators/compat/relu.pbtxt @@ -0,0 +1,43 @@ +type: "relu" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } +} diff --git a/paddle/fluid/operators/compat/relu6.pbtxt b/paddle/fluid/operators/compat/relu6.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..edd29037324430702ba70e9632d72f01b339b390 --- /dev/null +++ b/paddle/fluid/operators/compat/relu6.pbtxt @@ -0,0 +1,40 @@ +type: "relu6" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "threshold" + type: FLOAT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/seqconv.pbtxt b/paddle/fluid/operators/compat/seqconv.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..d05aabcc0aa4099a524d32b106c29c148329f472 --- /dev/null +++ b/paddle/fluid/operators/compat/seqconv.pbtxt @@ -0,0 +1,34 @@ +type: "sequence_conv" +def { + inputs { + name: "X" + } + inputs { + name: "Filter" + } + inputs { + name: "PaddingData" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "paddingTrainable" + type: BOOLEAN + } + attrs { + name: "contextLength" + type: INT + } + attrs { + name: "contextStart" + type: INT + } + attrs { + name: "contextStride" + type: INT + } + +} diff --git a/paddle/fluid/operators/compat/sequence_expand.pbtxt b/paddle/fluid/operators/compat/sequence_expand.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..38169d7b57ded849af1886828f4ae18fd2b7841d --- /dev/null +++ b/paddle/fluid/operators/compat/sequence_expand.pbtxt @@ -0,0 +1,38 @@ +type: "sequence_expand" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "ref_level" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + 
type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/sigmoid.pbtxt b/paddle/fluid/operators/compat/sigmoid.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..7b53aa402c1183d3f9688cc8528ad42dcd10e1b5 --- /dev/null +++ b/paddle/fluid/operators/compat/sigmoid.pbtxt @@ -0,0 +1,39 @@ +type: "sigmoid" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/sqrt.pbtxt b/paddle/fluid/operators/compat/sqrt.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..2dbcba802a4086e841080399300eb95f8ba1069d --- /dev/null +++ b/paddle/fluid/operators/compat/sqrt.pbtxt @@ -0,0 +1,39 @@ +type: "sqrt" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/swish.pbtxt b/paddle/fluid/operators/compat/swish.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..4f5ec127e489794742f88d5589847e598956b981 --- /dev/null +++ b/paddle/fluid/operators/compat/swish.pbtxt @@ -0,0 +1,40 @@ +type: "swish" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "beta" + type: FLOAT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/tanh.pbtxt b/paddle/fluid/operators/compat/tanh.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a0e6cf8a0a90add80200a524e2721eec00a07751 --- /dev/null +++ b/paddle/fluid/operators/compat/tanh.pbtxt @@ -0,0 +1,39 @@ +type: "tanh" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index 87bb3397ca2672ce377b74682cb0445e31b03677..d242c9f8c3fbd538b3ec0ce95fa5929c7c8ccd0a 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -52,9 +52,11 @@ class ConcatNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = NpuOpRunner( - "ConcatD", 
{inputs}, {*out}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; runner.AddInputNames(names); runner.Run(stream); } @@ -101,8 +103,9 @@ class ConcatGradNPUKernel : public framework::OpKernel { sizes.push_back(ins[j]->dims()[dim]); } } - auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, - {{"offsets", offsets}, {"size", sizes}}); + const auto& runner = + NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); runner.Run(stream); } if (ins[j]->numel() != 0UL) { diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index a60201f9d07d69897ec81ced54964a50a9d84795..cc0c46adb119a160d166e9093cc4ff677d8bd4e0 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -13,18 +13,84 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, - paddle::operators::GreaterThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, - paddle::operators::GreaterThanFunctor, - paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +#define DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(func, op) \ + template \ + struct func { \ + using ELEMENT_TYPE = T; \ + inline HOSTDEVICE bool operator()(const T* args) const { \ + return args[0] op args[1]; \ + } \ + }; + +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessThanFunctor, <) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessEqualFunctor, <=) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterThanFunctor, >) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterEqualFunctor, >=) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaEqualFunctor, ==) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaNotEqualFunctor, !=) +#undef DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT + +template +struct CudaEqualFunctor< + T, typename std::enable_if::value>::type> { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { + return fabs(static_cast(args[0] - args[1])) < 1e-8; + } +}; + +template +struct CudaNotEqualFunctor< + T, typename std::enable_if::value>::type> { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { + return fabs(static_cast(args[0] - args[1])) > 1e-8; + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + public: + using InT = typename Functor::ELEMENT_TYPE; + using OutT = bool; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template 
device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_type, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>); + +REGISTER_CUDA_COMPARE_KERNEL(equal, CudaEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(not_equal, CudaNotEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_than, CudaLessThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_equal, CudaLessEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_than, CudaGreaterThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_equal, CudaGreaterEqualFunctor) +#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index 591fb55936734ffc675dad5c6912e7cbf4e80471..d1656fd079cd76446d12e553a1ff37af5bfeeeaa 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -34,7 +34,7 @@ class EqualNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -51,7 +51,7 @@ class LessThanNPUKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); // int axis = context.Attr("axis"); z->mutable_data(ctx.GetPlace()); // allocate - auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + const auto& runner = NpuOpRunner("Less", {*x, *y}, {*z}); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index 62019be26cdef8214fe0e7c3e063c9387a30c91a..6705d42bcd74086e327d54fa44b9daf03efcba40 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -73,6 +73,8 @@ class ConditionalBlockInferOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); + VLOG(3) << "Conditional block.idx = " << block->ID() + << ", scope = " << &cur_scope; exec.Run(*block->Program(), &cur_scope, block->ID(), false); scope.DeleteScope(scopes->front()); } diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 7ca54b488bfbb260c422941b82145f092a150be7..6cbcd516e08264499afdea00d081ae93eb8b319b 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -13,12 +13,68 @@ See the License for the specific language governing permissions and limitations under the License. 
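One behavioral subtlety in the new CUDA compare kernels above: for floating-point element types, equal and not_equal are specialized to an absolute tolerance of 1e-8 rather than exact comparison (the cast's type parameter is garbled in this extract; double is the natural reading). A scalar illustration of those semantics, as a sketch:

    #include <cmath>

    bool cuda_equal_semantics(float a, float b) {
      return std::fabs(static_cast<double>(a - b)) < 1e-8;  // mirrors CudaEqualFunctor
    }
    // e.g. cuda_equal_semantics(1.0f, 1.0f + 1e-9f) is true, unlike a == b.

Note that the not_equal specialization tests fabs(...) > 1e-8, so a pair whose difference is exactly 1e-8 satisfies neither functor.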
*/ #include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, - paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, - paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, - paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, - paddle::operators::LogicalXorFunctor); +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T* args) const { \ + return args[0] op args[1]; \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(CudaAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(CudaXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct CudaNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { return !args[0]; } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + using InT = typename Functor::ELEMENT_TYPE; + using OutT = bool; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + + if (ins.size() == 1) { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } else { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, \ + ops::BinaryLogicalOpKernel>); + +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, CudaXorFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, CudaNotFunctor) +#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 1b0c0e444347af0a90f8244590b84199dc97f931..b9807bfa53e1e116089f5a593d69f5110b0b8f10 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -40,7 +40,7 @@ class LogicalNotNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 9825fcd8a6a67b9fd21e70e0870cc904ca9a9dbf..c6cd45dc18ba323407e3b3a0d5729c3b19a10c47 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -23,6 +23,7 @@ limitations under the License. 
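logical_not is the one unary op in this family, so BinaryLogicalOpKernel above picks the launch by arity: PackTensorsIntoVector yields one input tensor for it and two for and/or/xor. A host-side sketch of that dispatch under the same pointer-args convention (plain vectors stand in for the packed tensors; the Apply helper is an assumption, only the functor names come from the patch):

#include <cstdio>
#include <vector>

struct CudaAndFunctor {
  bool operator()(const bool* args) const { return args[0] && args[1]; }
};
struct CudaNotFunctor {
  bool operator()(const bool* args) const { return !args[0]; }
};

// Applies a functor at index i, packing one or two operands depending on
// how many inputs were collected, the same arity test the kernel uses.
template <typename Functor>
bool Apply(const Functor& f, const std::vector<const bool*>& ins, int i) {
  bool args[2] = {false, false};
  args[0] = ins[0][i];
  if (ins.size() == 2) args[1] = ins[1][i];  // binary case
  return f(args);
}

int main() {
  bool x[2] = {true, false}, y[2] = {true, true};
  std::printf("%d %d\n", Apply(CudaAndFunctor{}, {x, y}, 1),  // 0
              Apply(CudaNotFunctor{}, {x}, 1));               // 1
  return 0;
}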
*/ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cudnn_desc.h" namespace paddle { namespace operators { @@ -58,8 +59,8 @@ static void RemovePaddingSlice(const framework::ExecutionContext& context, *context.template device_context().eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = new_out_dims[i]; @@ -81,7 +82,8 @@ static void RemovePaddingSlice(const framework::ExecutionContext& context, auto out_t = framework::EigenTensor::From( *out, new_out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + EigenSlice, T, D>::Eval(place, out_t, in_t, + offsets, extents); } template @@ -209,20 +211,31 @@ struct SearchAlgorithm { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - args.cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { #if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (dev_ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; +#endif // CUDNN_VERSION >= 8100 + } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_FMA_MATH)); + VLOG(5) << "use cudnn_fma_math"; #endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_DEFAULT_MATH)); + VLOG(5) << "use cudnn_default_math"; } #endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 7fdb1ccfe9614fc0b30c7e13f564ece217c08b36..c49a3ee1c20ed32bd8d0504a28e4d7bb5f9917e3 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1413,6 +1413,31 @@ REGISTER_OP_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); #else +#if CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); +REGISTER_OP_KERNEL( + conv2d_grad_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + 
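+    // These conv2d / conv2d_grad / conv2d_grad_grad registrations sit inside
+    // #if CUDNN_VERSION_MIN(8, 1, 0): the extra bfloat16 kernels rely on
+    // cuDNN 8.1's bfloat16 support, matching the math-type branch added in
+    // conv_cudnn_helper.h above.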
paddle::operators::CUDNNConvDoubleGradOpKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad_grad, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); +#else REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, paddle::operators::CUDNNConvOpKernel, @@ -1432,6 +1457,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); +#endif REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 85bb4e5baa058a4cc5e6e4b9e1aec9ac75b3c5ea..1266cfe6081acf46fe66212adda23a396601965f 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -73,7 +73,17 @@ std::vector ConvOp::ComputeOutputShape( "the filter's dimension is %d.", in_dims, in_dims.size(), filter_dims, filter_dims.size())); - int in_sub_stride_size = in_dims.size() - strides.size(); + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larger than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; PADDLE_ENFORCE_EQ( in_dims.size(), strides.size() + 2U, platform::errors::InvalidArgument( @@ -189,6 +199,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::errors::InvalidArgument( "float16 can only be used when CUDNN is used")); } +#if PADDLE_WITH_CUDA + if (input_data_type == framework::proto::VarType::BF16 && + library == framework::LibraryType::kCUDNN) { + PADDLE_ENFORCE_GE( + platform::CudnnVersion(), 8100, + platform::errors::InvalidArgument( + "bfloat16 can only be used when CUDNN_VERSION >= 8100")); + } +#endif // PADDLE_WITH_CUDA auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library, customized_type_value); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 4ea936d5104b83ce30e43fe214e7f1e0936325ee..f004ea1c69e0c5ba69f26a1e3141e6e407fad4be 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -66,7 +66,19 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { "input is [%s], the dimension size of input is [%d], the shape " "of filter is [%s], the dimension size of filter is [%d]. 
", in_dims, in_dims.size(), filter_dims, filter_dims.size())); - int in_sub_stride_size = in_dims.size() - strides.size(); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( in_dims.size() - strides.size(), 2U, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index ecf5b6d774a2605c06bbeb2514c981b46e7f6a0d..b8335c75064286625997d2874fb076721afdde85 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/depthwise_conv.h" @@ -40,8 +41,8 @@ static void Slice(const framework::ExecutionContext& context, auto& place = *context.template device_context().eigen_device(); auto in_dims = input->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = in_dims[i]; @@ -64,7 +65,8 @@ static void Slice(const framework::ExecutionContext& context, framework::EigenTensor::From( *out, out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + EigenSlice, T, D>::Eval(place, out_t, in_t, + offsets, extents); out->Resize(out_dims); } diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 2031ed14242a1a2b4a441bf171bfeb31790506a3..193c0ca8dc0f4dbb6eff06f4899c53e7bf460cf7 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -220,3 +220,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( crop_grad, ops::CropGradKernel, ops::CropGradKernel); + +REGISTER_OP_CUDA_KERNEL( + crop, ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel, + ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu deleted file mode 100644 index 0a83e6aa57155b3bd85f8be02be9fa2f9cab39a8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/crop_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/crop_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - crop, ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CUDA_KERNEL( - crop_grad, ops::CropGradKernel, - ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 0338495096a7b1553152a80a68dc4e054859105c..f1fc216bd4feb470e0c811344428239c3ff9c9da 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -89,15 +90,16 @@ void CropFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - Eigen::array e_offsets; - Eigen::array e_shape; + Eigen::DSizes e_offsets; + Eigen::DSizes e_shape; for (size_t i = 0; i < D; ++i) { e_offsets[i] = offsets[i]; e_shape[i] = out->dims()[i]; } auto& place = *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); + EigenSlice, T, D>::Eval( + place, out_tensor, x_tensor, e_offsets, e_shape); } template @@ -148,16 +150,17 @@ void CropGradFunction(const framework::ExecutionContext& context) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); - Eigen::array, D> paddings; + Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device( - *context.template device_context().eigen_device()) = - d_out_tensor.pad(paddings, 0); + auto& place = + *context.template device_context().eigen_device(); + EigenPad, T, D>::Eval( + place, d_x_tensor, d_out_tensor, paddings, static_cast(0)); } } diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 514333c57f57cf3efa7b40f07d1a7c024e1d1715..28238082b18bf1279cb1ef4649aa8fd465c50b6b 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -319,3 +319,16 @@ REGISTER_OP_CPU_KERNEL( ops::CropTensorGradKernel, ops::CropTensorGradKernel, ops::CropTensorGradKernel); + +REGISTER_OP_CUDA_KERNEL( + crop_tensor, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel); +REGISTER_OP_CUDA_KERNEL( + crop_tensor_grad, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel); diff --git a/paddle/fluid/operators/crop_tensor_op.cu b/paddle/fluid/operators/crop_tensor_op.cu deleted file mode 100644 index c3a144d1719d041dd56323850de04f6a1c71b29a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/crop_tensor_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
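CropGradFunction above inverts the forward slice by zero-padding: for each dimension the leading pad equals the crop offset and the trailing pad is the remainder, paddings[i] = (offsets[i], d_x->dims()[i] - d_out->dims()[i] - offsets[i]). As a worked case, cropping a length-5 axis at offset 1 down to length 3 yields paddings of (1, 5 - 3 - 1) = (1, 1), so the gradient is padded with one zero on each side back to length 5. The same arithmetic reappears in crop_tensor_op.h below.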
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/crop_tensor_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - crop_tensor, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel); -REGISTER_OP_CUDA_KERNEL( - crop_tensor_grad, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel); diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 58960465b90bd0eb427f78b00dfe21a7b0e7abe8..54666c8482c021bee2b9cc2679ccf4a65daf4cd7 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -199,15 +200,16 @@ void CropTensorFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - Eigen::array e_offsets; - Eigen::array e_shape; + Eigen::DSizes e_offsets; + Eigen::DSizes e_shape; for (size_t i = 0; i < D; ++i) { e_offsets[i] = offsets[i]; e_shape[i] = out->dims()[i]; } auto& place = *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); + EigenSlice, T, D>::Eval( + place, out_tensor, x_tensor, e_offsets, e_shape); } template @@ -259,16 +261,17 @@ void CropTensorGradFunction(const framework::ExecutionContext& context) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); - Eigen::array, D> paddings; + Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device( - *context.template device_context().eigen_device()) = - d_out_tensor.pad(paddings, 0); + auto& place = + *context.template device_context().eigen_device(); + EigenPad, T, D>::Eval( + place, d_x_tensor, d_out_tensor, paddings, static_cast(0)); } } diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 0987118ba39b6ec6893ea3914a30ff477c42d6a6..09d607891b48542876a374cbf00db713befde4b2 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -23,8 +23,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using complex64 = platform::complex64; -using complex128 = platform::complex128; template struct P { diff --git a/paddle/fluid/operators/eigen/CMakeLists.txt b/paddle/fluid/operators/eigen/CMakeLists.txt index 848bf2433c5e394bf00f4b335b83da4e0fdec144..8b64e35b93526eb7edbe7f723832126ef7f0e0a6 100644 --- a/paddle/fluid/operators/eigen/CMakeLists.txt +++ b/paddle/fluid/operators/eigen/CMakeLists.txt @@ -1,10 +1,9 @@ file(GLOB EIGEN_CC_SOURCES RELATIVE 
"${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -cc_library(eigen_cc_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) -if(WITH_GPU OR WITH_ROCM) - file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") - if(WITH_GPU) - nv_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) - elseif(WITH_ROCM) - hip_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) - endif() +file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") +if(WITH_GPU) + nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3) +elseif(WITH_ROCM) + hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3) +else() + cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) endif() diff --git a/paddle/fluid/operators/eigen/constant.cc b/paddle/fluid/operators/eigen/constant.cc new file mode 100644 index 0000000000000000000000000000000000000000..45b03ccbf10043ad142c7de15d7cdf110e134f9a --- /dev/null +++ b/paddle/fluid/operators/eigen/constant.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenConstant { + using Type = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::DefaultDevice& dev, Type out, const T value) { + out.device(dev) = out.constant(value); + } +}; + +template struct EigenConstant; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/constant.cu b/paddle/fluid/operators/eigen/constant.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf4a2917f7d36f817b53aa892ff1b43b347086c8 --- /dev/null +++ b/paddle/fluid/operators/eigen/constant.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenConstant { + using Type = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::GpuDevice& dev, Type out, const T value) { + out.device(dev) = out.constant(value); + } +}; + +template struct EigenConstant; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h index 59669505959f3f2b9d2b5d378e1e0b297df1718e..9a3be7ca439b9aead2e931c7fa3036128400b057 100644 --- a/paddle/fluid/operators/eigen/eigen_function.h +++ b/paddle/fluid/operators/eigen/eigen_function.h @@ -12,6 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#ifndef NOMINMAX +#define NOMINMAX +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -48,5 +54,207 @@ struct EigenBroadcastGrad { const Array& reduce_dims, const Array2& reshape_dims); }; +template +struct EigenConstant { + using Type = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const EigenDevice& dev, Type out, const T value); +}; + +template +struct EigenSign { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in); +}; + +template +struct EigenReverse { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const Array& reverse); +}; + +template +struct EigenAdd { + using InType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const T value); +}; + +template +struct EigenSub { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& left, + const InType& right); +}; + +template +struct EigenSlice { + using Array = Eigen::DSizes; + using Array32Bit = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const Array& offsets, const Array& extents); + static void Eval(const EigenDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& offsets, + const Array32Bit& extents); +}; + +template +struct EigenPad { + using Array = std::array, Rank>; + using Array32Bit = std::array, Rank>; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const Array& padding, const T value); + static void Eval(const EigenDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& padding, + const T value); +}; + 
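+// EigenSlice and EigenPad each declare a second Eval overload built on
+// 32-bit-index TensorMaps (the *32BitIndex aliases above). Kernels whose
+// element counts fit in int32 can route through it, since Eigen emits
+// cheaper GPU index arithmetic for 32-bit indices.
+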
+template +struct EigenScale { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const T scale, const T bias, const bool bias_after_scale); +}; + +template +struct EigenErf { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in); +}; + +template +struct EigenErfGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType din, const InType& in, + const InType& dout); +}; + +template +struct EigenRankLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& label, + const InType& left, const InType& right); +}; + +template +struct EigenRankLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void EvalLeft(const EigenDevice& dev, OutType dleft, + const InType& dout, const InType& label, + const InType& left, const InType& right); + static void EvalRight(const EigenDevice& dev, OutType dright, + const InType& dout, const InType& label, + const InType& left, const InType& right); +}; + +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& pred, + const InType& label, const T& epsilon); +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss, + const InType& pred, const InType& label, const T& epsilon); +}; + +template +struct EigenHingeLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType loss, const InType& pred, + const InType& label); +}; + +template +struct EigenHingeLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss, + const InType& pred, const InType& label); +}; + +template +struct EigenL1Norm { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in); +}; + +template +struct EigenL1NormGrad { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType din, const InType& dout, + const InType& in, const Array& bcast); +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/eigen/elementwise.cc b/paddle/fluid/operators/eigen/elementwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..bedecfe5c224feda5126050be1f80843db5b0a87 --- /dev/null +++ b/paddle/fluid/operators/eigen/elementwise.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenAdd { + using InType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const T value) { + out.device(dev) = in + value; + } +}; + +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; + +template +struct EigenSub { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& left, const InType& right) { + out.device(dev) = left - right; + } +}; + +template struct EigenSub; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/elementwise.cu b/paddle/fluid/operators/eigen/elementwise.cu new file mode 100644 index 0000000000000000000000000000000000000000..a750a06284f5e44fa71440820e2c40c0868f4e6f --- /dev/null +++ b/paddle/fluid/operators/eigen/elementwise.cu @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenAdd { + using InType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const T value) { + out.device(dev) = in + value; + } +}; + +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; + +template +struct EigenSub { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& left, + const InType& right) { + out.device(dev) = left - right; + } +}; + +template struct EigenSub; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/erf.cc b/paddle/fluid/operators/eigen/erf.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c2c734c97769418fa9316150c606909acf33eba --- /dev/null +++ b/paddle/fluid/operators/eigen/erf.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
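elementwise.cc and elementwise.cu above mirror each other: identical Eigen expressions, differing only in the device type they are instantiated for. A host-side usage sketch of the expression EigenSub wraps, on a rank-1 map with the CPU device:

#include <cstdio>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  float l_buf[3] = {5.f, 7.f, 9.f}, r_buf[3] = {1.f, 2.f, 3.f}, out_buf[3];
  using Map = Eigen::TensorMap<
      Eigen::Tensor<float, 1, Eigen::RowMajor, Eigen::DenseIndex>>;
  Map left(l_buf, 3), right(r_buf, 3), out(out_buf, 3);
  Eigen::DefaultDevice dev;
  out.device(dev) = left - right;  // the body of EigenSub::Eval
  std::printf("%g %g %g\n", out_buf[0], out_buf[1], out_buf[2]);  // 4 5 6
  return 0;
}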
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenErf { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in) { + out.device(dev) = in.erf(); + } +}; + +template +struct EigenErfGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType din, + const InType& in, const InType& dout) { + din.device(dev) = + dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); + } +}; + +#define INSTANTIATION(FUNCTOR) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenErf); +INSTANTIATION(EigenErfGrad); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/erf.cu b/paddle/fluid/operators/eigen/erf.cu new file mode 100644 index 0000000000000000000000000000000000000000..632205bdcbf7efaf6004e071ea078739742a417f --- /dev/null +++ b/paddle/fluid/operators/eigen/erf.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenErf { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) { + out.device(dev) = in.erf(); + } +}; + +template +struct EigenErfGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType din, const InType& in, + const InType& dout) { + din.device(dev) = + dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); + } +}; + +#define INSTANTIATION(FUNCTOR) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenErf); +INSTANTIATION(EigenErfGrad); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/l1_norm.cc b/paddle/fluid/operators/eigen/l1_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7ed60f76662eb7907f4884d93149f6f49bc0bc8 --- /dev/null +++ b/paddle/fluid/operators/eigen/l1_norm.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenL1Norm { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in) { + out.device(dev) = in.abs().sum(); + } +}; + +template +struct EigenL1NormGrad { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType din, + const InType& dout, const InType& in, const Array& bcast) { + din.device(dev) = dout.broadcast(bcast) * in.sign(); + } +}; + +template struct EigenL1Norm; +template struct EigenL1NormGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/l1_norm.cu b/paddle/fluid/operators/eigen/l1_norm.cu new file mode 100644 index 0000000000000000000000000000000000000000..a27cd7ae6b7898d8d7fe4001cdfd447d02e19cb7 --- /dev/null +++ b/paddle/fluid/operators/eigen/l1_norm.cu @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
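The backward expression in EigenErfGrad comes straight from the calculus: d/dx erf(x) = (2 / sqrt(pi)) * exp(-x^2), and M_2_SQRTPI is exactly the C math constant 2 / sqrt(pi), so the kernel computes din = dout * M_2_SQRTPI * exp(-in^2). On MSVC that constant only appears when _USE_MATH_DEFINES is defined before the math headers, which is why both eigen_function.h and erf.cu set it first.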
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenL1Norm { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) { + out.device(dev) = in.abs().sum(); + } +}; + +template +struct EigenL1NormGrad { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType din, const InType& dout, + const InType& in, const Array& bcast) { + din.device(dev) = dout.broadcast(bcast) * in.sign(); + } +}; + +template struct EigenL1Norm; +template struct EigenL1NormGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/loss.cc b/paddle/fluid/operators/eigen/loss.cc new file mode 100644 index 0000000000000000000000000000000000000000..469456537d9aa20564cf9abe2bf1ece735534be3 --- /dev/null +++ b/paddle/fluid/operators/eigen/loss.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
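EigenL1NormGrad above encodes the subgradient of the L1 norm: d||x||_1 / dx_i = sign(x_i), so the scalar upstream gradient is broadcast back to the input's shape and multiplied elementwise by in.sign(). At x_i = 0, Eigen's sign returns 0, which is a valid subgradient choice.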
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenRankLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& label, const InType& left, + const InType& right) { + out.device(dev) = + (1.0f + (left - right).exp()).log() - label * (left - right); + } +}; + +template +struct EigenRankLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + + static void EvalLeft(const Eigen::DefaultDevice& dev, OutType dleft, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dleft.device(dev) = dout * (1.0f / (1.0f + (right - left).exp()) - label); + } + + static void EvalRight(const Eigen::DefaultDevice& dev, OutType dright, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dright.device(dev) = -dout * (1.0f / (1.0f + (right - left).exp()) - label); + } +}; + +template struct EigenRankLoss; +template struct EigenRankLossGrad; + +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& pred, const InType& label, const T& epsilon) { + out.device(dev) = (-(label * (pred + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - pred + epsilon).log())); + } +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, const InType& label, + const T& epsilon) { + dpred.device(dev) = + dloss * + (-(label / (pred + epsilon)) + + ((static_cast(1) - label) / (static_cast(1) - pred + epsilon))); + } +}; + +template struct EigenLogLoss; +template struct EigenLogLossGrad; + +template +struct EigenHingeLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType loss, + const InType& pred, const InType& label) { + loss.device(dev) = (static_cast(1) - + pred * (static_cast(2) * label - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +struct EigenHingeLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, + const InType& label) { + auto alt_labels = static_cast(2) * label - static_cast(1); + dpred.device(dev) = + dloss * ((pred * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } +}; + +template struct EigenHingeLoss; +template struct EigenHingeLossGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/loss.cu b/paddle/fluid/operators/eigen/loss.cu new file mode 100644 index 0000000000000000000000000000000000000000..02341202a2b4f18acc79f7bd4d4c69a69a039eca --- /dev/null +++ b/paddle/fluid/operators/eigen/loss.cu @@ -0,0 +1,123 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
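EigenRankLoss above is the pairwise logistic (RankNet-style) loss. Writing d = left - right, the forward pass is loss = log(1 + e^d) - label * d, and differentiating with respect to left gives sigma(d) - label, where sigma(d) = 1 / (1 + e^(-d)) = 1 / (1 + e^(right - left)); EvalLeft writes sigma(d) in exactly that second form, and EvalRight is its negation.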
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenRankLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, + const InType& label, const InType& left, + const InType& right) { + out.device(dev) = + (1.0f + (left - right).exp()).log() - label * (left - right); + } +}; + +template +struct EigenRankLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + + static void EvalLeft(const Eigen::GpuDevice& dev, OutType dleft, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dleft.device(dev) = dout * (1.0f / (1.0f + (right - left).exp()) - label); + } + + static void EvalRight(const Eigen::GpuDevice& dev, OutType dright, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dright.device(dev) = -dout * (1.0f / (1.0f + (right - left).exp()) - label); + } +}; + +template struct EigenRankLoss; +template struct EigenRankLossGrad; + +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& pred, + const InType& label, const T& epsilon) { + out.device(dev) = (-(label * (pred + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - pred + epsilon).log())); + } +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, const InType& label, + const T& epsilon) { + dpred.device(dev) = + dloss * + (-(label / (pred + epsilon)) + + ((static_cast(1) - label) / (static_cast(1) - pred + epsilon))); + } +}; + +template struct EigenLogLoss; +template struct EigenLogLossGrad; + +template +struct EigenHingeLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType loss, + const InType& pred, const InType& label) { + loss.device(dev) = (static_cast(1) - + pred * (static_cast(2) * label - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +struct EigenHingeLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, + const InType& label) { + auto alt_labels = static_cast(2) * label - static_cast(1); + dpred.device(dev) = + dloss * ((pred * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } +}; + +template struct EigenHingeLoss; +template struct EigenHingeLossGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/pad.cc b/paddle/fluid/operators/eigen/pad.cc new file mode 100644 index 0000000000000000000000000000000000000000..421c9eaf5cde2bbbca56512685903ee3dc28fc49 --- 
/dev/null +++ b/paddle/fluid/operators/eigen/pad.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/complex.h" + +namespace paddle { +namespace operators { + +template +struct EigenPad { + using Array = std::array, Rank>; + using Array32Bit = std::array, Rank>; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const Array& padding, const T value) { + out.device(dev) = in.pad(padding, value); + } + + static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& padding, + const T value) { + out.device(dev) = in.pad(padding, value); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenPad, int); +INSTANTIATION(EigenPad, int64_t); +INSTANTIATION(EigenPad, float); +INSTANTIATION(EigenPad, double); +INSTANTIATION(EigenPad, platform::complex); +INSTANTIATION(EigenPad, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/pad.cu b/paddle/fluid/operators/eigen/pad.cu new file mode 100644 index 0000000000000000000000000000000000000000..4cf88712d95cbb2e526068ebdfca9999e5fda449 --- /dev/null +++ b/paddle/fluid/operators/eigen/pad.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenPad { + using Array = std::array, Rank>; + using Array32Bit = std::array, Rank>; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const Array& padding, const T value) { + out.device(dev) = in.pad(padding, value); + } + + static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& padding, + const T value) { + out.device(dev) = in.pad(padding, value); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenPad, int); +INSTANTIATION(EigenPad, int64_t); +INSTANTIATION(EigenPad, float); +INSTANTIATION(EigenPad, double); +INSTANTIATION(EigenPad, platform::float16); +INSTANTIATION(EigenPad, platform::bfloat16); +INSTANTIATION(EigenPad, platform::complex); +INSTANTIATION(EigenPad, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/reverse.cc b/paddle/fluid/operators/eigen/reverse.cc new file mode 100644 index 0000000000000000000000000000000000000000..02044479db952ff27c06148ca39c4a2a3e36330a --- /dev/null +++ b/paddle/fluid/operators/eigen/reverse.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenReverse { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const Array& reverse) { + out.device(dev) = in.reverse(reverse); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenReverse, int); +INSTANTIATION(EigenReverse, uint8_t); +INSTANTIATION(EigenReverse, int64_t); +INSTANTIATION(EigenReverse, bool); +INSTANTIATION(EigenReverse, float); +INSTANTIATION(EigenReverse, double); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/reverse.cu b/paddle/fluid/operators/eigen/reverse.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b769489ce723678b2cc1440bf6c3d374e3a55d6 --- /dev/null +++ b/paddle/fluid/operators/eigen/reverse.cu @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenReverse { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const Array& reverse) { + out.device(dev) = in.reverse(reverse); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenReverse, int); +INSTANTIATION(EigenReverse, uint8_t); +INSTANTIATION(EigenReverse, int64_t); +INSTANTIATION(EigenReverse, bool); +INSTANTIATION(EigenReverse, float); +INSTANTIATION(EigenReverse, double); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/scale.cc b/paddle/fluid/operators/eigen/scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..e85878f20aa2b80b398561938ad96f6349cb7eec --- /dev/null +++ b/paddle/fluid/operators/eigen/scale.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" + +namespace paddle { +namespace operators { + +template +struct EigenScale { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const T scale, const T bias, + const bool bias_after_scale) { + if (bias_after_scale) { + out.device(dev) = scale * in + bias; + } else { + out.device(dev) = scale * (in + bias); + } + } +}; + +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/scale.cu b/paddle/fluid/operators/eigen/scale.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a77f72f6200c0640d08e5ba9e1ddfb39211aaed --- /dev/null +++ b/paddle/fluid/operators/eigen/scale.cu @@ -0,0 +1,46 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenScale { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const T scale, const T bias, const bool bias_after_scale) { + if (bias_after_scale) { + out.device(dev) = scale * in + bias; + } else { + out.device(dev) = scale * (in + bias); + } + } +}; + +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/sign.cc b/paddle/fluid/operators/eigen/sign.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a4445f6569d388a4181eec1bed2faf190aeb729 --- /dev/null +++ b/paddle/fluid/operators/eigen/sign.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
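EigenScale above keeps both affine orders distinct: with bias_after_scale the result is scale * x + bias, otherwise scale * (x + bias). The two differ whenever bias != 0 and scale != 1; for example x = 2, scale = 3, bias = 1 gives 7 in the first case and 9 in the second.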
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenSign { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in) { + out.device(dev) = in.sign(); + } +}; + +template struct EigenSign; +template struct EigenSign; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/sign.cu b/paddle/fluid/operators/eigen/sign.cu new file mode 100644 index 0000000000000000000000000000000000000000..52c8d3c80dd2c5d0d64e9a92ae596d7b69e70476 --- /dev/null +++ b/paddle/fluid/operators/eigen/sign.cu @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenSign { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) { + out.device(dev) = in.sign(); + } +}; + +template struct EigenSign; +template struct EigenSign; +template struct EigenSign; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/slice.cc b/paddle/fluid/operators/eigen/slice.cc new file mode 100644 index 0000000000000000000000000000000000000000..2579b5f07eb27817f5488d8065fa05f409d1163f --- /dev/null +++ b/paddle/fluid/operators/eigen/slice.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenSlice { + using Array = Eigen::DSizes; + using Array32Bit = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const Array& offsets, + const Array& extents) { + out.device(dev) = in.slice(offsets, extents); + } + + static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& offsets, + const Array32Bit& extents) { + out.device(dev) = in.slice(offsets, extents); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenSlice, bool); +INSTANTIATION(EigenSlice, int); +INSTANTIATION(EigenSlice, int8_t); +INSTANTIATION(EigenSlice, uint8_t); +INSTANTIATION(EigenSlice, int16_t); +INSTANTIATION(EigenSlice, int64_t); +INSTANTIATION(EigenSlice, float); +INSTANTIATION(EigenSlice, double); +INSTANTIATION(EigenSlice, platform::float16); +INSTANTIATION(EigenSlice, platform::bfloat16); +INSTANTIATION(EigenSlice, platform::complex); +INSTANTIATION(EigenSlice, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/slice.cu b/paddle/fluid/operators/eigen/slice.cu new file mode 100644 index 0000000000000000000000000000000000000000..dc51fa722202bb2d8b7fb168255a13916f3dc157 --- /dev/null +++ b/paddle/fluid/operators/eigen/slice.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenSlice { + using Array = Eigen::DSizes; + using Array32Bit = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const Array& offsets, const Array& extents) { + out.device(dev) = in.slice(offsets, extents); + } + + static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& offsets, + const Array32Bit& extents) { + out.device(dev) = in.slice(offsets, extents); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenSlice, int); +INSTANTIATION(EigenSlice, int64_t); +INSTANTIATION(EigenSlice, float); +INSTANTIATION(EigenSlice, double); +INSTANTIATION(EigenSlice, platform::float16); +INSTANTIATION(EigenSlice, platform::bfloat16); +INSTANTIATION(EigenSlice, platform::complex); +INSTANTIATION(EigenSlice, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 37e5fa5a20657748804442e549baa999169836d2..aff0cb281642ecf9d9ee62890474ac87841c5e9a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -28,11 +28,11 @@ namespace operators { 1. For Unary Op, the length of input array is 1, e.g. Relu: return args[0] > 0 ? args[0] : 0; 2. For Binary Op, the length of input array is 2, - e.g. Add: return args[0] + args[1]; + e.g. Add: return args[0] expr args[1]; */ template struct CudaAddFunctor { - __device__ __forceinline__ T operator()(const T* args) const { + inline HOSTDEVICE T operator()(const T* args) const { return args[0] + args[1]; } }; @@ -42,18 +42,12 @@ class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - axis = axis == -1 ? 
std::abs(x->dims().size() - y->dims().size()) : axis; - - std::vector ins = {x, y}; - std::vector outs = {z}; + std::vector ins; + std::vector outs; const auto& cuda_ctx = ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); LaunchElementwiseCudaKernel( cuda_ctx, ins, &outs, axis, CudaAddFunctor()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index ec7d036a1a1e0295ec496960069335fb33d3d003..a469ebbaec2edc9fadf0992412ef7d3b23d483e6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -72,12 +72,10 @@ class ElementwiseAddKernel : public framework::OpKernel { auto *z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); if (x->dims() == y->dims()) { - SameDimsElemwiseAdd - LaunchElementwiseCpuKernel; + SameDimsElemwiseAdd LaunchElementwiseCpuKernel; LaunchElementwiseCpuKernel(ctx, x, y, z); } else { - LaunchBroadcastElementwiseCpuKernel(ctx, x, - y, z); + LaunchBroadcastElementwiseCpuKernel(ctx, x, y, z); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 3768748931ded2a2541484bef2c8c37e72adda13..72d7e318d7b0526750ba0153c57e054247624f13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -32,7 +32,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -82,8 +82,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -96,8 +97,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } } if (axes.size() != 0) { - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { framework::TensorCopy( @@ -123,8 +124,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -138,8 +140,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } if (axes.size() != 0) { dy->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { framework::TensorCopy( diff --git 
a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc index 8852f3a419adc51d311178175fd6f71a8c628540..4f3da27f4a67379624f5b5a66840bbc0cbac4f17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc @@ -40,7 +40,7 @@ class ElementwiseDivNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); runner.Run(stream); } }; @@ -65,46 +65,47 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { Tensor y_power(y->type()); y_power.mutable_data(y->dims(), place); - auto y_power_runner = NpuOpRunner("Power", {*y}, {y_power}, - {{"power", static_cast(-1)}}); - y_power_runner.Run(stream); + const auto& runner_y_power = NpuOpRunner( + "Power", {*y}, {y_power}, {{"power", static_cast(-1)}}); + runner_y_power.Run(stream); if (dx) { dx->mutable_data(place); Tensor tensor_zeros(x->type()); tensor_zeros.mutable_data(x->dims(), place); - auto tensor_zeros_runner = + const auto& runner_tensor_zeros = NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); - tensor_zeros_runner.Run(stream); + runner_tensor_zeros.Run(stream); Tensor x_zero(paddle::framework::proto::VarType::BOOL); x_zero.mutable_data(x->dims(), place); - auto x_zero_runner = + const auto& runner_x_zero = NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); - x_zero_runner.Run(stream); + runner_x_zero.Run(stream); Tensor x_nozero(paddle::framework::proto::VarType::BOOL); x_nozero.mutable_data(x->dims(), place); - auto x_nozero_runner = + const auto& runner_x_nonzero = NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); - x_nozero_runner.Run(stream); + runner_x_nonzero.Run(stream); Tensor x_nozero_f(x->type()); x_nozero_f.mutable_data(x->dims(), place); - auto x_nozero_f_runner = + const auto& runner_x_nonzero_f = NpuOpRunner("Cast", {x_nozero}, {x_nozero_f}, {{"dst_type", static_cast(0)}}); - x_nozero_f_runner.Run(stream); + runner_x_nonzero_f.Run(stream); Tensor x_grad_w(x->type()); x_grad_w.mutable_data(x->dims(), place); - auto x_grad_w_runner = + const auto& runner_x_grad_w = NpuOpRunner("Mul", {x_nozero_f, y_power}, {x_grad_w}, {}); - x_grad_w_runner.Run(stream); + runner_x_grad_w.Run(stream); - auto x_grad_runner = NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); - x_grad_runner.Run(stream); + const auto& runner_x_grad = + NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); + runner_x_grad.Run(stream); } if (dy) { @@ -112,16 +113,18 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { Tensor neg_out(y->type()); neg_out.mutable_data(y->dims(), place); - auto neg_out_runner = NpuOpRunner("Neg", {*out}, {neg_out}, {}); - neg_out_runner.Run(stream); + const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {}); + runner_neg_out.Run(stream); Tensor y_grad_w(y->type()); y_grad_w.mutable_data(y->dims(), place); - auto y_grad_w_runner = NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {}); - y_grad_w_runner.Run(stream); + const auto& runner_y_grad_w = + NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {}); + runner_y_grad_w.Run(stream); - auto y_grad_runner = NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {}); - y_grad_runner.Run(stream); + const auto& runner_y_grad = + NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {}); + runner_y_grad.Run(stream); } } }; diff --git 
a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc index da0116114747fa2e44045b75f3bd9bd0dc73d980..d97c04f10c497870cedbd7c42616ddf6c3431311 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc @@ -37,7 +37,7 @@ class ElementwiseFloorDivNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 5d086a1b29febd8e57507eced7683f414ca34e07..d4b5d98d5b0b345119f833e5a684d8f0b6e1f310 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -12,9 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaMaxFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return (args[0] > args[1] ? args[0] : args[1]); + } +}; + +template +class ElementwiseMaxKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMaxFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc index 3cdb6420e8ee1d159ecd525ab6a2360544ca5323..a616d0bc9d156453c5ce09403fb4dbc27dc133e9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc @@ -40,7 +40,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index cf93e5a97a3f3110aae907c593f58dbab0f9d090..4a99f7e36705f0d96b200d20e880bebf5b5b2186 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -12,9 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaMinFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return (args[0] > args[1] ? args[1] : args[0]); + } +}; + +template +class ElementwiseMinKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMinFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index 987c250d651475d44da7e2ebf88222b74e5b5af0..48ac3905f32bd90c8d495d7bae37b0a5cc2c15f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -40,7 +40,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 8fd4609c3aa8508687540d5424a9e91511a1a3b5..adcc18f837e670ff54459be8f47c97977269a439 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -24,37 +25,26 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseMul { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - MulRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaMulFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] * args[1]; } }; -template <> -struct SameDimsElemwiseMul { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseMulCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseMulKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + framework::Tensor x_for_selectedrows; + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs, &x_for_selectedrows); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMulFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 10e69491643c92d77f58c487abd122d51def82e5..a734f891a9d9e83592156442e48215a93af3a920 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -126,7 +126,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; - template struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 08df6d4e27af0a79123f26ad2064ee0203cc1b28..47aa7e2521f76abe0bbbdf4c9adc4f02b43434ff 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -41,7 +41,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); runner.Run(stream); } }; @@ -65,14 +65,14 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { if (dx) { dx->mutable_data(place); - auto dx_runner = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); - dx_runner.Run(stream); + const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); + runner_dx.Run(stream); } if (dy) { dy->mutable_data(place); - auto dy_runner = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); - dy_runner.Run(stream); + const 
auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {});
+      runner_dy.Run(stream);
     }
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
index 1492fc629457cd5f7ca312b452ccd79ab30f175d..541ff9aacfc46247e1dee1b6fa6b1c523a9c470b 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
@@ -343,7 +343,6 @@ template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
 void LaunchBroadcastElementwiseCudaKernel(
     const platform::CUDADeviceContext &cuda_ctx,
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, int axis, Functor func) {
-  static_assert(ET == (ElementwiseType)2, "Only Support binary calculation.");
+  PADDLE_ENFORCE_EQ(ET, ElementwiseType::kBinary,
+                    platform::errors::InvalidArgument(
+                        "Currently, only support binary calculation, "
+                        "but received %d input tensors.\n",
+                        static_cast<int>(ET)));
   int in_vec_size = 4;
   framework::Tensor *out = (*outs)[0];
   for (auto *in : ins) {
@@ -501,23 +504,28 @@ void LaunchBroadcastElementwiseCudaKernel(
   }
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
 void LaunchElementwiseCudaKernel(
     const platform::CUDADeviceContext &cuda_ctx,
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, int axis, Functor func) {
+  std::vector<int> dims_size;
   bool no_broadcast_flag = true;
   for (auto *in : ins) {
     no_broadcast_flag = ins[0]->dims() == in->dims();
+    dims_size.emplace_back(in->dims().size());
   }
   if (no_broadcast_flag) {
-    LaunchSameDimsElementwiseCudaKernel<ET, T>(
-        cuda_ctx, ins, outs, func);
+    LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs,
+                                                       func);
   } else {
-    LaunchBroadcastElementwiseCudaKernel<ET, T>(cuda_ctx, ins, outs, axis,
-                                                func);
+    axis = axis == -1
+               ? *std::max_element(dims_size.begin(), dims_size.end()) -
+                     *std::min_element(dims_size.begin(), dims_size.end())
+               : axis;
+    LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs,
+                                                        axis, func);
   }
 }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h
index 32e49cf3996f120d2e2a8f909883e0c46f7b1352..d09e777670990a818c7a07bba9b800b1cb331566 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h
@@ -60,6 +60,71 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024;
 namespace paddle {
 namespace operators {
 
+/*
+* Pack the input and output tensors into their respective vectors,
+* taking the class type of input variable X into account.
+* In this helper, X may be either a LoDTensor or a SelectedRows type.
+* When X is a SelectedRows, a valid pointer x_for_selectedrows is
+* expected to be passed in from the op kernel so that the LoDTensor
+* created ahead in this function can be accessed through it.
+*/
+template <typename T>
+int PackTensorsIntoVector(const framework::ExecutionContext &ctx,
+                          std::vector<const framework::Tensor *> *ins,
+                          std::vector<framework::Tensor *> *outs,
+                          framework::Tensor *x_for_selectedrows = nullptr) {
+  int axis = -1;
+  auto x_var = ctx.InputVar("X");
+  PADDLE_ENFORCE_NOT_NULL(
+      x_var, platform::errors::InvalidArgument(
+                 "Unable to get input Variable X, Variable name is %s.\n",
+                 ctx.InputName("X")));
+  auto *y = ctx.Input<framework::LoDTensor>("Y");
+  framework::Tensor *z;
+
+  if (x_var->IsType<framework::LoDTensor>()) {
+    auto *x = ctx.Input<framework::LoDTensor>("X");
+    z = ctx.Output<framework::LoDTensor>("Out");
+    ins->emplace_back(x);
+  } else if (x_var->IsType<framework::SelectedRows>()) {
+    PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true,
+                      platform::errors::InvalidArgument(
+                          "For elementwise_op, if X is Sparse, Y must be "
+                          "scalar. But received the size of Y = %d.",
+                          y->dims().size()));
+    PADDLE_ENFORCE_NOT_NULL(
+        x_for_selectedrows,
+        platform::errors::InvalidArgument(
+            "The parameter x_for_selectedrows is expected to "
+            "be valid, once input variable X's class type is "
+            "SelectedRows.\n"));
+    auto &x_sele = x_var->Get<framework::SelectedRows>();
+    auto out_sele = ctx.Output<framework::SelectedRows>("Out");
+    *x_for_selectedrows = x_sele.value();
+    out_sele->set_rows(x_sele.rows());
+    out_sele->set_height(x_sele.height());
+    out_sele->mutable_value()->Resize(x_sele.value().dims());
+    out_sele->mutable_value()->mutable_data(ctx.GetPlace(),
+                                            x_for_selectedrows->type());
+    z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
+    ins->emplace_back(x_for_selectedrows);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "X's type[%s] is not supported by elementwise_op. X's type should be "
+        "LoDTensor or SelectedRows.",
+        framework::ToTypeName(x_var->Type())));
+  }
+  z->mutable_data<T>(ctx.GetPlace());
+  outs->emplace_back(z);
+
+  if (y != nullptr) {
+    ins->emplace_back(y);
+    axis = ctx.HasAttr("axis") ? ctx.Attr<int>("axis") : -1;
+  }
+  return axis;
+}
+
 /*
 * Out = X ⊙ Y
 * If Y's shape does not match X's shape, they will be reshaped.
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
index 320d1e7b38da8e4f77015ef2b7bcc73e5db7675f..5335f274ef126f228694d1bfb23cb15f6da158ee 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
@@ -8,10 +8,52 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
 #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
 
 namespace ops = paddle::operators;
 
+namespace paddle {
+namespace operators {
+
+template <typename T, typename Enable = void>
+struct CudaPowFunctor {
+  inline HOSTDEVICE T operator()(const T args[]) const {
+    return std::pow(args[0], args[1]);
+  }
+};
+
+template <typename T>
+struct CudaPowFunctor<
+    T, typename std::enable_if<std::is_integral<T>::value>::type> {
+  // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
+  // it will return a float number like 2.99..., which is floored to 2
+  // when cast to int by default, and that is wrong.
+  // Use llrint to cast it to the nearest integer, which is 3.
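The rounding hazard described in the comment above can be reproduced on the host; a minimal standalone sketch, for illustration only and not part of the patch:

#include <cmath>
#include <cstdint>
#include <iostream>

int main() {
  // Some pow implementations return a value slightly below the exact
  // result, e.g. 8.99999... for pow(3.0f, 2.0f).
  float p = std::pow(3.0f, 2.0f);
  int64_t truncated = static_cast<int64_t>(p);  // truncates: may yield 8
  int64_t rounded = std::llrint(p);             // rounds to nearest: 9
  std::cout << truncated << " " << rounded << std::endl;
  return 0;
}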
+ inline HOSTDEVICE T operator()(const T args[]) const { + return std::llrint(std::pow(args[0], args[1])); + } +}; + +template +class ElementwisePowKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaPowFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_pow, ops::ElementwisePowKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc index 26cc925b869c647d5a02215c8c8621782cdf2303..e0763d769f047a963ea8e4905a9e79e1b583703a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc @@ -40,7 +40,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 19cbbb7bf04287b49e023aaa10c9635b6c4fbda7..da9610243f7c4df3300b3ea8b9137cea84e5c72b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -11,8 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -24,37 +23,25 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - SubRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaSubFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] - args[1]; } }; -template <> -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseSubCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseSubKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaSubFunctor()); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 4171d2eb9e5e53ea2fff9a2ab7521f2e5c4ae438..426093413276092538c67676abb2c1e9b7f637ed 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index a6e438f8016e0cd4c8fccee6c664d509b8c170eb..94e78defbbee5d767194dd403a176574008f03ac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -33,7 +33,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() @@ -84,8 +84,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -98,8 +99,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } } if (axes.size() != 0) { - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { framework::TensorCopy( @@ -127,8 +128,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -144,14 +146,15 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { if (axes.size() != 0) { reduced_dy.Resize(dy->dims()); reduced_dy.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); tmp_dy = &reduced_dy; } // stage 3, negative - auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); + const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index 09cdf4d8b2a0dd3b445dc5215dd86b8b1963196e..f68f670394871114369f8b05b7f958c03d5508d0 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -130,3 +130,14 @@ REGISTER_OP_CPU_KERNEL( ops::ErfGradKernel, ops::ErfGradKernel); + +REGISTER_OP_CUDA_KERNEL( + erf, ops::ErfKernel, + ops::ErfKernel, + ops::ErfKernel); +REGISTER_OP_CUDA_KERNEL( + erf_grad, ops::ErfGradKernel, + ops::ErfGradKernel, + ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.cu b/paddle/fluid/operators/erf_op.cu deleted file mode 100644 index 357b9e79c4e72854549f11ab49735fac65a400be..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/erf_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h index 08c827df95d9bfa4f01f3c7af9e657b7b3a360a8..4780b2e7f5b28d4a743f6d35046891b30cbefd00 100644 --- a/paddle/fluid/operators/erf_op.h +++ b/paddle/fluid/operators/erf_op.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -35,7 +36,8 @@ class ErfKernel : public framework::OpKernel { auto eigen_in = framework::EigenVector::Flatten(*in); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_in.erf(); + EigenErf, T>::Eval(place, eigen_out, + eigen_in); } }; @@ -55,8 +57,8 @@ class ErfGradKernel : public framework::OpKernel { auto eigen_dx = framework::EigenVector::Flatten(*dx); auto& place = *context.template device_context().eigen_device(); - eigen_dx.device(place) = - eigen_dout * static_cast(M_2_SQRTPI) * (-(eigen_x.square())).exp(); + EigenErfGrad, T>::Eval(place, eigen_dx, + eigen_x, eigen_dout); } }; diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index bb3a6512d2c8ba3b5f0d643a5ae6d906a00717c3..76d5a203f306b9b9773af50d5de5db7b6c89ae5e 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL #include #include #include @@ -65,7 +64,7 @@ class ExpandNPUKernel : public framework::OpKernel { out0->Resize(out_dims); out0->mutable_data(context.device_context().GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); auto stream = context.template device_context() @@ -82,5 +81,3 @@ REGISTER_OP_NPU_KERNEL( ops::ExpandNPUKernel, ops::ExpandNPUKernel); - -#endif diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 4e576730edc6b892d91449ca92427f5c76f85f70..2a3e1e35f457114328833f4df129c473927ed1b5 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -152,16 +152,15 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel); +REGISTER_OP_CPU_KERNEL( + fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel>, + ops::FillConstantKernel>); REGISTER_OP_VERSION(fill_constant) .AddCheckpoint( diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc index e784c20b8b8b4f9fa61b3bcebf481a989d4bb033..a862cda13888ee7086d8ce17511b9851a36d18a6 100644 --- a/paddle/fluid/operators/fill_constant_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -15,12 +15,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fill_constant_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel); +REGISTER_OP_CUDA_KERNEL( + fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel>, + ops::FillConstantKernel>); diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 4ea4c11c478357aa7ca98fc0de4467bae7100a87..2626e6d960f8e952a722eb6a31b995c829610c5e 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -68,8 +68,8 @@ class FillConstantNPUKernel : public framework::OpKernel { FillNpuTensorWithConstant(&tensor_tmp, value); out_var->mutable_data(shape, place); - auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, - {{"dims", framework::vectorize(shape)}}); + const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, + {{"dims", framework::vectorize(shape)}}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/fill_constant_op_xpu.cc b/paddle/fluid/operators/fill_constant_op_xpu.cc index 16dd4c9292f89a05d58cfc1d821c5a43f45f5add..d55b8e2b81b52f173dc2f8f158a2f42ae7abd7eb 100644 --- a/paddle/fluid/operators/fill_constant_op_xpu.cc +++ b/paddle/fluid/operators/fill_constant_op_xpu.cc @@ -15,11 +15,10 @@ limitations under the License. */ namespace ops = paddle::operators; #ifdef PADDLE_WITH_XPU -REGISTER_OP_XPU_KERNEL(fill_constant, ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel); +REGISTER_OP_XPU_KERNEL( + fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel>, + ops::FillConstantKernel>); #endif diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 1ee8889995f4d6045f237aa51e00faff7f67b2a3..7c6dd418071ba30e94f9316cb9f9fbd0641e1619 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -33,8 +33,8 @@ class GatherOpNPUKernel : public framework::OpKernel { auto *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Gather", {*x, *index}, {*out}, - {{"validate_indices", true}}); + const auto &runner = NpuOpRunner("Gather", {*x, *index}, {*out}, + {{"validate_indices", true}}); auto stream = ctx.template device_context() .stream(); @@ -75,7 +75,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel { zeroslike_xout.numel() * sizeof(T), stream); // step3: scatter(x_grad) - auto runner_scatter = NpuOpRunner( + const auto &runner_scatter = NpuOpRunner( "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); runner_scatter.Run(stream); } diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 56aa509177cfd3e5ecfd521e0b66fd72fc708c38..6e60926cc7951aa777138bb8785083eb48ee50dd 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -39,7 +39,7 @@ class 
GeluNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
-    auto runner = NpuOpRunner("Gelu", {*x}, {*out}, {});
+    const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {});
     runner.Run(stream);
   }
 };
@@ -63,11 +63,12 @@ class GeluGradNPUKernel : public framework::OpKernel<T> {
     Tensor out(x->type());
     out.mutable_data<T>(x->dims(), place);
-    auto out_runner = NpuOpRunner("Gelu", {*x}, {out}, {});
-    out_runner.Run(stream);
+    const auto& runner_out = NpuOpRunner("Gelu", {*x}, {out}, {});
+    runner_out.Run(stream);
 
-    auto dx_runner = NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {});
-    dx_runner.Run(stream);
+    const auto& runner_dx =
+        NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {});
+    runner_dx.Run(stream);
   }
 };
diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu
index e9b0a0108afc2336aa3bf350173ea4fa38635593..762d14096a5ab4d094894ad7c0ec822f5cc25d3b 100644
--- a/paddle/fluid/operators/grid_sampler_op.cu
+++ b/paddle/fluid/operators/grid_sampler_op.cu
@@ -187,7 +187,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
   int out_sC = out_h * out_w;
   int out_sH = out_w;
   int out_sW = 1;
-
   CUDA_KERNEL_LOOP(index, nthreads) {
     const int w = index % out_w;
     const int h = (index / out_w) % out_h;
@@ -199,7 +198,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
     ix = compute_positions(ix, in_w, padding_mode, align_corners);
     iy = compute_positions(iy, in_h, padding_mode, align_corners);
-
     if (mode == Mode::bilinear) {
       int ix_nw = static_cast<int>(floor(ix));
       int iy_nw = static_cast<int>(floor(iy));
@@ -216,6 +214,7 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c,
       T se = (ix - ix_nw) * (iy - iy_nw);
 
       auto inp_offset_NC = n * inp_sN;
+
       auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW;
       for (int c = 0; c < out_c;
            ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) {
@@ -291,17 +290,17 @@ class GridSampleOpCUDAKernel : public framework::OpKernel<T> {
             << "; out_w: " << out_w;
     auto* output = ctx.Output<Tensor>("Output");
     auto* output_data = output->mutable_data<T>(ctx.GetPlace());
-
-    VLOG(3) << "set constant";
+    VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1]
+            << "; " << output->dims()[2] << "; " << output->dims()[3];
     math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
         dev_ctx, output, static_cast<T>(0));
     int count = static_cast<int>(n * out_h * out_w);
-    auto cu_stream = dev_ctx.stream();
-
-    int block = 512;
-    int grid_size = (count + block - 1) / block;
-    grid_sample_cuda_kernel<T><<<grid_size, block, 0, cu_stream>>>(
+    int block_size = 512;
+    int grid_size = (count + block_size - 1) / block_size;
+    VLOG(3) << "cuda launch - grid dims: " << grid_size
+            << "; block dims: " << block_size;
+    grid_sample_cuda_kernel<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
         count, n, c, out_h, out_w, in_h, in_w, input->data<T>(),
         grid->data<T>(), output_data, mode, padding_mode, align_corners);
   }
@@ -475,9 +474,12 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel<T> {
     int count = static_cast<int>(n * out_h * out_w);
     auto cu_stream = dev_ctx.stream();
-    int block = 512;
-    int grid_size = (count + block - 1) / block;
-    grid_sampler_cuda_backward_kernel<T><<<grid_size, block, 0, cu_stream>>>(
+    int block_size = 512;
+    int grid_size = (count + block_size - 1) / block_size;
+    VLOG(3) << "cuda launch grad kernel - grid dims: " << grid_size
+            << "; block dims: " << block_size << "; count: " << count;
+    grid_sampler_cuda_backward_kernel<
+        T><<<grid_size, block_size, 0, cu_stream>>>(
        count, output_grad->data<T>(), input->data<T>(), grid->data<T>(), n,
        c, out_h, out_w, in_h, in_w, input_grad->data<T>(), grid_grad_data,
        mode, padding_mode, align_corners);
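Both launches above size the grid with the standard ceiling-division idiom, so that grid_size * block_size >= count, while CUDA_KERNEL_LOOP inside the kernels skips the surplus thread indices. A small host-side sketch of the arithmetic, illustrative only and not part of the patch:

#include <cstdio>

int main() {
  const int block_size = 512;
  const int count = 1000;  // e.g. n * out_h * out_w
  // Ceiling division: the smallest grid that covers `count` work items.
  const int grid_size = (count + block_size - 1) / block_size;  // = 2
  // 2 blocks * 512 threads = 1024 threads; indices 1000..1023 fail the
  // CUDA_KERNEL_LOOP bound check and do no work.
  std::printf("grid=%d, block=%d\n", grid_size, block_size);
  return 0;
}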
diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h
index afe70ea64a99977737333168ab7ccff154d57668..2f0edd0451a3b76aa25a38de5febbabd70cf838d 100644
--- a/paddle/fluid/operators/group_norm_op.h
+++ b/paddle/fluid/operators/group_norm_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 #pragma once
 #include <algorithm>
+#include <array>
+#include <numeric>
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -73,6 +75,11 @@ class GroupNormKernel : public framework::OpKernel<T> {
     auto* iter_y_data = y_data;
     for (int bid = 0; bid < x_dims[0]; bid++) {
       for (int gid = 0; gid < groups; gid++) {
+        const int64_t M = 8;
+        std::array<T, M> x_mean_arr;
+        std::array<T, M> x_var_arr;
+        std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0));
+        std::fill(x_var_arr.begin(), x_var_arr.end(), T(0));
         T x_mean = 0, x_var = 0;
         int number =
             std::min(group_size, static_cast<int>(C - gid * group_size));
@@ -83,7 +90,37 @@ class GroupNormKernel : public framework::OpKernel<T> {
         if (data_layout == DataLayout::kNCHW) {
           for (int cid = 0; cid < number; cid++) {
-            for (int imid = 0; imid < imsize; imid++, iter_x_data++) {
+            int imid;
+            for (imid = 0; imid < imsize - (imsize % M);
+                 imid += M, iter_x_data += M) {
+              // TODO(gaoxiang): Because AVX/AVX2/AVX512 cannot be used
+              // directly in template classes/functions, temporarily unroll
+              // this loop for higher precision and performance until a
+              // high-performance CPU vector extension is complete.
+              x_mean_arr[0] += iter_x_data[0];
+              x_var_arr[0] += iter_x_data[0] * iter_x_data[0];
+              x_mean_arr[1] += iter_x_data[1];
+              x_var_arr[1] += iter_x_data[1] * iter_x_data[1];
+              x_mean_arr[2] += iter_x_data[2];
+              x_var_arr[2] += iter_x_data[2] * iter_x_data[2];
+              x_mean_arr[3] += iter_x_data[3];
+              x_var_arr[3] += iter_x_data[3] * iter_x_data[3];
+              x_mean_arr[4] += iter_x_data[4];
+              x_var_arr[4] += iter_x_data[4] * iter_x_data[4];
+              x_mean_arr[5] += iter_x_data[5];
+              x_var_arr[5] += iter_x_data[5] * iter_x_data[5];
+              x_mean_arr[6] += iter_x_data[6];
+              x_var_arr[6] += iter_x_data[6] * iter_x_data[6];
+              x_mean_arr[7] += iter_x_data[7];
+              x_var_arr[7] += iter_x_data[7] * iter_x_data[7];
+            }
+            x_mean =
+                std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean);
+            x_var =
+                std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var);
+            std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0));
+            std::fill(x_var_arr.begin(), x_var_arr.end(), T(0));
+            for (; imid < imsize; imid++, iter_x_data++) {
               x_mean += iter_x_data[0];
               x_var += iter_x_data[0] * iter_x_data[0];
             }
@@ -91,7 +128,37 @@
         } else {
           for (int cid = 0; cid < number; cid++) {
             iter_x_data = tmp_x + cid;
-            for (int imid = 0; imid < imsize; imid++, iter_x_data += C) {
+            int imid;
+            for (imid = 0; imid < imsize - (imsize % M);
+                 imid += M, iter_x_data += M * C) {
+              // TODO(gaoxiang): Because AVX/AVX2/AVX512 cannot be used
+              // directly in template classes/functions, temporarily unroll
+              // this loop for higher precision and performance until a
+              // high-performance CPU vector extension is complete.
+              x_mean_arr[0] += iter_x_data[0 * C];
+              x_var_arr[0] += iter_x_data[0 * C] * iter_x_data[0 * C];
+              x_mean_arr[1] += iter_x_data[1 * C];
+              x_var_arr[1] += iter_x_data[1 * C] * iter_x_data[1 * C];
+              x_mean_arr[2] += iter_x_data[2 * C];
+              x_var_arr[2] += iter_x_data[2 * C] * iter_x_data[2 * C];
+              x_mean_arr[3] += iter_x_data[3 * C];
+              x_var_arr[3] += iter_x_data[3 * C] * iter_x_data[3 * C];
+              x_mean_arr[4] += iter_x_data[4 * C];
+              x_var_arr[4]
+= iter_x_data[4 * C] * iter_x_data[4 * C]; + x_mean_arr[5] += iter_x_data[5 * C]; + x_var_arr[5] += iter_x_data[5 * C] * iter_x_data[5 * C]; + x_mean_arr[6] += iter_x_data[6 * C]; + x_var_arr[6] += iter_x_data[6 * C] * iter_x_data[6 * C]; + x_mean_arr[7] += iter_x_data[7 * C]; + x_var_arr[7] += iter_x_data[7 * C] * iter_x_data[7 * C]; + } + x_mean = + std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); + x_var = + std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); + for (; imid < imsize; imid++, iter_x_data += C) { x_mean += iter_x_data[0]; x_var += iter_x_data[0] * iter_x_data[0]; } @@ -101,8 +168,8 @@ class GroupNormKernel : public framework::OpKernel { x_mean /= number * imsize; x_var /= number * imsize; - x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); + x_var = std::max(x_var - x_mean * x_mean, T(0)); + T var_inv = T(1) / std::sqrt(x_var + epsilon); mean_data[bid * groups + gid] = x_mean; var_data[bid * groups + gid] = x_var; diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index e60b1538eee64e9eae7bdae8b7b1d6117c80d229..cce80518354d75b9caa61462a2d3cefb3fa47627 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -143,3 +143,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( hinge_loss_grad, ops::HingeLossGradKernel); + +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/hinge_loss_op.cu deleted file mode 100644 index b5ea0a702e0e540c1831ca241a5def19f86c239c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/hinge_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/hinge_loss_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_CUDA_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h index 10c17a0982fd7995056aeb1f70648fd78b3d9c05..c78eddd2528117035085d7ada63bfde5798562dc 100644 --- a/paddle/fluid/operators/hinge_loss_op.h +++ b/paddle/fluid/operators/hinge_loss_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -33,9 +34,7 @@ class HingeLossKernel : public framework::OpKernel { auto y = framework::EigenVector::Flatten(*label); loss->mutable_data(context.GetPlace()); auto l = framework::EigenVector::Flatten(*loss); - l.device(place) = - (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) - .cwiseMax(static_cast(0)); + EigenHingeLoss, T>::Eval(place, l, x, y); } }; @@ -59,10 +58,8 @@ class HingeLossGradKernel : public framework::OpKernel { if (dpred) { dpred->mutable_data(context.GetPlace()); auto dx = framework::EigenVector::Flatten(*dpred); - auto alt_labels = static_cast(2) * y - static_cast(1); - dx.device(place) = - dl * ((x * alt_labels) < static_cast(1)).template cast() * - (-alt_labels); + EigenHingeLossGrad, T>::Eval(place, dx, dl, + x, y); } } }; diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index b973d5d9d8fe16ffb0faab83576bd5f71a16474c..d248857b8f42fb9e8a6c8a0ac60546a390597714 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -192,3 +192,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( im2sequence_grad, ops::Im2SequenceGradKernel); + +REGISTER_OP_CUDA_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu deleted file mode 100644 index 1c34640618d58d3b5fe627fa6596260a7b687d05..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/im2sequence_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -#include "paddle/fluid/operators/im2sequence_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 9c9069b722763d0ec0d39d2f6fb35477c7578f30..760d6a63de13ac72a578e565c1bea8fc58130eb9 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/math_function.h" @@ -157,7 +158,7 @@ class Im2SequenceGradKernel : public framework::OpKernel { auto x_v = framework::EigenVector::Flatten(*d_x); auto& place = *ctx.template device_context().eigen_device(); - x_v.device(place) = x_v.constant(0.0); + EigenConstant, T, 1>::Eval(place, x_v, 0.0); auto in_dim = in->dims(); int batch_size = in_dim[0]; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index e8edfb99f9f306d7057afcdf935cad5a5e4a73d6..e727f6ceb56f7e53d5828dad5bde8d11f05df379 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -107,3 +107,9 @@ REGISTER_OP_CPU_KERNEL( ops::IncrementKernel, ops::IncrementKernel, ops::IncrementKernel); + +REGISTER_OP_CUDA_KERNEL( + increment, ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel); diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h index d0e8c66255ef68b975701fb6b3c145be2590e271..4b9d07146484ff00ba105b9971f40f91dd8148de 100644 --- a/paddle/fluid/operators/increment_op.h +++ b/paddle/fluid/operators/increment_op.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -30,8 +31,9 @@ class IncrementKernel : public framework::OpKernel { out_tensor->mutable_data(context.GetPlace()); auto& dev = *context.template device_context().eigen_device(); - framework::EigenScalar::From(*out_tensor).device(dev) = - framework::EigenScalar::From(*x_tensor) + static_cast(step); + EigenAdd, T>::Eval( + dev, framework::EigenScalar::From(*out_tensor), + framework::EigenScalar::From(*x_tensor), static_cast(step)); } }; diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 7d75e385e8f3b7c88c393c7195b49e17397f08aa..35ebe92b364d3cf241c3778687b0d4123700c56b 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -43,7 +43,7 @@ class IncrementalNPUKernel : public framework::OpKernel { step_tensor.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&step_tensor, static_cast(step)); - auto runner = + const auto& runner = NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); auto stream = diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index dab9948edc3592e8c1635c5bb62b7dfbd09dd1e1..308330313a976997df9547abc9db6ec091718543 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -18,8 +18,7 @@ limitations under the License. 
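A recurring change in the NPU kernels in this patch (increment_op_npu.cc above, layer_norm_op_npu.cc and lookup_table_v2_op_npu.cc below) is binding the runner as `const auto&` instead of `auto`. Binding the NpuOpRunner temporary to a const reference extends its lifetime to the enclosing scope without requiring a copy. A toy illustration of the difference (stand-in type, not the Paddle class):

#include <iostream>
#include <string>

// Toy stand-in for NpuOpRunner, to show why `const auto&` is preferred.
struct Runner {
  std::string op;
  explicit Runner(std::string o) : op(std::move(o)) { std::cout << "construct\n"; }
  Runner(const Runner& r) : op(r.op) { std::cout << "copy\n"; }
  void Run() const { std::cout << "run " << op << "\n"; }
};

int main() {
  auto a = Runner(std::string("Add"));   // pre-C++17 this may invoke the copy
                                         // constructor (usually elided)
  const auto& b = Runner(std::string("Cast"));  // binds the temporary; its
                                         // lifetime is extended, no copy made
  a.Run();
  b.Run();
}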
*/ #include #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -185,9 +184,9 @@ REGISTER_OP_CPU_KERNEL( ops::KronKernel, ops::KronKernel, ops::KronKernel, + paddle::platform::complex>, ops::KronKernel); + paddle::platform::complex>); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); REGISTER_OP_CPU_KERNEL( @@ -198,6 +197,6 @@ REGISTER_OP_CPU_KERNEL( ops::KronGradKernel, ops::KronGradKernel, ops::KronGradKernel, + paddle::platform::complex>, ops::KronGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu index a348cb2e1759e8ad8c2f70c7c25478c94e35e786..e5124e65007509568ae8cd8ab65b33c504a12fe9 100644 --- a/paddle/fluid/operators/kron_op.cu +++ b/paddle/fluid/operators/kron_op.cu @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -26,9 +25,9 @@ REGISTER_OP_CUDA_KERNEL( ops::KronKernel, ops::KronKernel, ops::KronKernel, + paddle::platform::complex>, ops::KronKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( kron_grad, ops::KronGradKernel, @@ -38,6 +37,6 @@ REGISTER_OP_CUDA_KERNEL( ops::KronGradKernel, ops::KronGradKernel, ops::KronGradKernel, + paddle::platform::complex>, ops::KronGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h index 6815fd460fa1f1969c9bf01f733f30b941fd8799..6c3bad4e1bdcd538f6d0ba4d94fd0ced374aaf23 100644 --- a/paddle/fluid/operators/kron_op.h +++ b/paddle/fluid/operators/kron_op.h @@ -26,9 +26,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; - // Process an element in the output, used with a parallel-for template struct KronElemFunctor { @@ -175,72 +172,13 @@ struct KronGradElemFunctor { const int ndims_; }; -template <> -struct KronGradElemFunctor { - KronGradElemFunctor(const complex64* dout, const complex64* A, - const complex64* B, complex64* dout_a, complex64* dout_b, - const int64_t* stride_dout, const int64_t* stride_a, - const int64_t* stride_b, const int64_t* shape_b, - const int64_t numel_a, const int64_t numel_b, - const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - dout_[idx] * complex64(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * complex64(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const complex64* dout_; - const complex64* A_; - const complex64* B_; - complex64* dout_a_; - complex64* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template <> -struct KronGradElemFunctor { - KronGradElemFunctor(const complex128* dout, const complex128* A, - const complex128* B, complex128* dout_a, - complex128* dout_b, const int64_t* stride_dout, +template +struct KronGradElemFunctor> { + KronGradElemFunctor(const platform::complex* dout, + const platform::complex* A, + const platform::complex* B, + platform::complex* dout_a, + platform::complex* dout_b, const int64_t* stride_dout, const int64_t* stride_a, const int64_t* stride_b, const int64_t* shape_b, const int64_t numel_a, const int64_t numel_b, const int ndims) @@ -273,21 +211,23 @@ struct KronGradElemFunctor { if (dout_a_) { size_t index_out_a = index_a * numel_b_ + index_b; dout_a_[index_out_a] = - dout_[idx] * complex128(B_[index_b].real, -B_[index_b].imag); + dout_[idx] * + platform::complex(B_[index_b].real, -B_[index_b].imag); } if (dout_b_) { size_t index_out_b = index_b * numel_a_ + index_a; dout_b_[index_out_b] = - dout_[idx] * complex128(A_[index_a].real, -A_[index_a].imag); + dout_[idx] * + platform::complex(A_[index_a].real, -A_[index_a].imag); } } private: - const complex128* dout_; - const complex128* A_; - const complex128* B_; - complex128* dout_a_; - complex128* dout_b_; + const platform::complex* dout_; + const platform::complex* A_; + const platform::complex* B_; + platform::complex* dout_a_; + platform::complex* dout_b_; const int64_t* stride_dout_; const int64_t* stride_a_; const int64_t* stride_b_; diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index e8f83f6b62221b9db14734917a1a2e44d8295f6e..ddd0554add5105b0e682c6cb2e42ac4ec936c448 100644 
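The kron_op.h rewrite above replaces two near-identical complex64/complex128 specializations with a single partial specialization over platform::complex<T>; the complex path differs from the generic functor only in conjugating the other operand when scattering the gradient (dA accumulates dout * conj(B), and vice versa). A compact illustration of the same pattern, using std::complex as a stand-in for Paddle's type:

#include <complex>
#include <iostream>

// Generic functor plus one partial specialization over std::complex<T>,
// mirroring how KronGradElemFunctor<platform::complex<T>> replaces the
// separate complex64/complex128 versions above.
template <typename T>
struct GradElem {
  T operator()(T dout, T other) const { return dout * other; }
};

template <typename T>
struct GradElem<std::complex<T>> {
  std::complex<T> operator()(std::complex<T> dout,
                             std::complex<T> other) const {
    return dout * std::conj(other);  // conjugate, as in the complex Kron grad
  }
};

int main() {
  GradElem<float> real_grad;
  GradElem<std::complex<float>> cplx_grad;
  std::cout << real_grad(2.f, 3.f) << "\n";                // 6
  std::cout << cplx_grad({0.f, 1.f}, {1.f, 1.f}) << "\n";  // (1,1)
}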
--- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -91,3 +91,9 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( l1_norm_grad, ops::L1NormGradKernel); + +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/l1_norm_op.cu deleted file mode 100644 index a5c29bbf5debdd11f6e5b28b3a8b48c2c484517a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/l1_norm_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/l1_norm_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h index c2a302ed05f1c63864629665110e29c60cedb796..918526914d95d8a91d121b7c17629c10ab4dee16 100644 --- a/paddle/fluid/operators/l1_norm_op.h +++ b/paddle/fluid/operators/l1_norm_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class L1NormKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); - out.device(place) = x.abs().sum(); + EigenL1Norm, T>::Eval(place, out, x); } }; @@ -59,8 +60,9 @@ class L1NormGradKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); - Eigen::DSizes x_dsize(x->numel()); - dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); + Eigen::DSizes x_dsize(x->numel()); + EigenL1NormGrad, T>::Eval( + place, dx_eigen, d_out_eigen, x_eigen, x_dsize); } }; diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index c0c228ef22af3e24f1ea6e1bc8607cda718ed40e..4aafe2856605e140aa9bd154c9183682b63eca6b 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -81,7 +81,7 @@ class LayerNormNPUKernel : public framework::OpKernel { Tensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); - auto runner = + const auto& runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); scale = &default_scale; @@ -95,7 +95,7 @@ class LayerNormNPUKernel : public framework::OpKernel { Tensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(0)); - auto runner = + const auto& runner = NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); runner.Run(stream); bias = &default_bias; @@ -110,7 +110,7 @@ class LayerNormNPUKernel : public framework::OpKernel { 
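The l1_norm_op.h hunk above is the same Eigen-helper refactor: the forward pass reduces to out = sum(|x_i|) and the backward pass broadcasts the scalar upstream gradient against sign(x). A plain-loop sketch of the semantics the EigenL1Norm/EigenL1NormGrad helpers are assumed to implement:

#include <cmath>
#include <cstddef>

// Reference semantics for the refactored L1-norm kernels above.
template <typename T>
T L1Norm(const T* x, std::size_t n) {
  T out = T(0);
  for (std::size_t i = 0; i < n; ++i) out += std::abs(x[i]);
  return out;
}

template <typename T>
void L1NormGrad(T d_out, const T* x, T* dx, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    // d|x|/dx = sign(x); the scalar d_out is broadcast over every element.
    T sign = static_cast<T>((x[i] > T(0)) - (x[i] < T(0)));
    dx[i] = d_out * sign;
  }
}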
cast_scale.Resize(scale->dims()); cast_scale.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_scale = + const auto& runner_cast_scale = NpuOpRunner("Cast", {*scale}, {cast_scale}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_scale.Run(stream); @@ -125,7 +125,7 @@ class LayerNormNPUKernel : public framework::OpKernel { cast_bias.Resize(bias->dims()); cast_bias.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_bias = + const auto& runner_cast_bias = NpuOpRunner("Cast", {*bias}, {cast_bias}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_bias.Run(stream); @@ -163,18 +163,18 @@ class LayerNormNPUKernel : public framework::OpKernel { variance->mutable_data(ctx.GetPlace()); } - auto runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias}, - {*y, *tmp_mean, *tmp_variance}, - {{"begin_norm_axis", begin_norm_axis}, - {"begin_params_axis", begin_norm_axis}, - {"epsilon", epsilon}}); + const auto& runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias}, + {*y, *tmp_mean, *tmp_variance}, + {{"begin_norm_axis", begin_norm_axis}, + {"begin_params_axis", begin_norm_axis}, + {"epsilon", epsilon}}); runner.Run(stream); // cast back from FP16 to FP32 if (x->type() == framework::proto::VarType::FP16 && mean->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(mean->type()); - auto runner_cast_mean = + const auto& runner_cast_mean = NpuOpRunner("Cast", {*tmp_mean}, {*mean}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_mean.Run(stream); @@ -183,7 +183,7 @@ class LayerNormNPUKernel : public framework::OpKernel { if (x->type() == framework::proto::VarType::FP16 && variance->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(variance->type()); - auto runner_cast_variance = + const auto& runner_cast_variance = NpuOpRunner("Cast", {*tmp_variance}, {*variance}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_variance.Run(stream); @@ -250,7 +250,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { Tensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); - auto runner = + const auto& runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); scale = &default_scale; @@ -265,7 +265,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_scale.Resize(scale->dims()); cast_scale.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_scale = + const auto& runner_cast_scale = NpuOpRunner("Cast", {*scale}, {cast_scale}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_scale.Run(stream); @@ -280,7 +280,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_mean.Resize(mean->dims()); cast_mean.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_mean = + const auto& runner_cast_mean = NpuOpRunner("Cast", {*mean}, {cast_mean}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_mean.Run(stream); @@ -295,7 +295,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_variance.Resize(variance->dims()); cast_variance.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_variance = + const auto& runner_cast_variance = NpuOpRunner("Cast", {*variance}, {cast_variance}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_variance.Run(stream); @@ -343,16 +343,16 @@ class 
LayerNormGradNPUKernel : public framework::OpKernel { dbias->mutable_data(ctx.GetPlace()); } - auto runner = NpuOpRunner("LayerNormGrad", - {*dy, *x, cast_variance, cast_mean, cast_scale}, - {*dx, *tmp_dscale, *tmp_dbias}, {}); + const auto& runner = NpuOpRunner( + "LayerNormGrad", {*dy, *x, cast_variance, cast_mean, cast_scale}, + {*dx, *tmp_dscale, *tmp_dbias}, {}); runner.Run(stream); // cast back from FP16 to FP32 if (x->type() == framework::proto::VarType::FP16 && dscale->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(dscale->type()); - auto runner_cast_dscale = + const auto& runner_cast_dscale = NpuOpRunner("Cast", {*tmp_dscale}, {*dscale}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_dscale.Run(stream); @@ -361,7 +361,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { if (x->type() == framework::proto::VarType::FP16 && dbias->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(dbias->type()); - auto runner_cast_dbias = + const auto& runner_cast_dbias = NpuOpRunner("Cast", {*tmp_dbias}, {*dbias}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_dbias.Run(stream); diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..148fb05afcfd9a4ef1fcbc587a2bd33947a41000 --- /dev/null +++ b/paddle/fluid/operators/lgamma_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/lgamma_op.h" + +namespace paddle { +namespace operators { + +class LgammaOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of lgamma op."); + AddOutput("Out", "(Tensor), The output tensor of lgamma op."); + AddComment(R"DOC( +Lgamma Operator. + +This operator performs elementwise lgamma for input $X$. 
+$$out = log\Gamma(x)$$ + +)DOC"); + } +}; + +class LgammaOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +template +class LgammaGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("lgamma_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +class LgammaGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "LgammaGrad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "LgammaGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "LgammaGrad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, + ops::LgammaGradMaker, + ops::LgammaGradMaker); + +REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); + +REGISTER_OP_CPU_KERNEL( + lgamma, ops::LgammaKernel, + ops::LgammaKernel) + +REGISTER_OP_CPU_KERNEL( + lgamma_grad, + ops::LgammaGradKernel, + ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..befd31e3bd8b1898ad6c59dca80dac3ae6de339d --- /dev/null +++ b/paddle/fluid/operators/lgamma_op.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/operators/math/complex_functors.h" + +namespace paddle { +namespace operators { + +template +struct CudaLgammaFunctor; + +template +struct CudaLgammaFunctor>> { + __device__ __forceinline__ T operator()(const T* args) const { + return Eigen::numext::lgamma(args[0]); + } +}; + +template +class LgammaKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + auto& dev_ctx = context.device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = CudaLgammaFunctor(); + LaunchSameDimsElementwiseCudaKernel>(dev_ctx, ins, &outs, + functor); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + lgamma, ops::LgammaKernel, + ops::LgammaKernel); + +REGISTER_OP_CUDA_KERNEL( + lgamma_grad, + ops::LgammaGradKernel, + ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h new file mode 100644 index 0000000000000000000000000000000000000000..674054e74573208ea9bbd537419d202e1a30d8c0 --- /dev/null +++ b/paddle/fluid/operators/lgamma_op.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct LgammaFunctor { + LgammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::lgamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct LgammaGradFunctor { + LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; + +using Tensor = framework::Tensor; + +template +class LgammaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + auto numel = x->numel(); + auto* x_data = x->data(); + auto* out_data = out->mutable_data(context.GetPlace(), + size_t(x->numel() * sizeof(T))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + LgammaFunctor functor(x_data, out_data, numel); + for_range(functor); + } +}; + +template +class LgammaGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data(); + auto* x_data = x->data(); + auto* dx_data = d_x->mutable_data( + ctx.GetPlace(), static_cast(numel * sizeof(T))); + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 1569512dc74f7209a4dd3921e275c02e40745535..c41805d41cef4618a3f355e04f8e156423f91b55 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -154,3 +154,8 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/operators/log_loss_op.cu deleted file mode 100644 index 280913c43a2749ddd5fbd3ae1905f1b823dd525d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_loss_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
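For the new lgamma operator (lgamma_op.h above), both passes are elementwise functors driven by platform::ForRange: the forward computes lgamma(x), the backward computes dout * psi(x), where psi (the digamma function) is the derivative of log Gamma. A standalone sketch using std::lgamma; the standard library has no digamma, so it is approximated here by a central difference purely for illustration (the real kernel calls Eigen::numext::digamma):

#include <cmath>
#include <cstdio>

// Forward: out = lgamma(x). Backward: dx = dout * digamma(x).
double LgammaForward(double x) { return std::lgamma(x); }

double LgammaBackward(double dout, double x) {
  const double h = 1e-6;
  // Central-difference stand-in for digamma(x) = d/dx lgamma(x).
  double digamma = (std::lgamma(x + h) - std::lgamma(x - h)) / (2.0 * h);
  return dout * digamma;
}

int main() {
  // digamma(1) = -gamma (Euler-Mascheroni constant) ~ -0.5772
  std::printf("lgamma(1)=%f dx=%f\n", LgammaForward(1.0),
              LgammaBackward(1.0, 1.0));
}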
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h index e62de17a98603109786e49725537867c3fe7831a..e7985ab810b138da62390fae29eb4a6cf638c897 100644 --- a/paddle/fluid/operators/log_loss_op.h +++ b/paddle/fluid/operators/log_loss_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -40,9 +41,8 @@ class LogLossKernel : public framework::OpKernel { auto loss = EigenVector::Flatten(*loss_out); auto& place = *ctx.template device_context().eigen_device(); - loss.device(place) = (-(label * (prediction + epsilon).log()) - - ((static_cast(1) - label) * - (static_cast(1) - prediction + epsilon).log())); + EigenLogLoss, T>::Eval( + place, loss, prediction, label, epsilon); } }; @@ -64,9 +64,8 @@ class LogLossGradKernel : public framework::OpKernel { if (dpred) { dpred->mutable_data(ctx.GetPlace()); auto dx = framework::EigenVector::Flatten(*dpred); - dx.device(place) = dl * (-(label / (prediction + epsilon)) + - ((static_cast(1) - label) / - (static_cast(1) - prediction + epsilon))); + EigenLogLossGrad, T>::Eval( + place, dx, dl, prediction, label, epsilon); } } }; diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index e4fe92c625640dba38daa6690705eed2cf0032be..12c607adb44f4e9590bd5a50305c9d6fd5b3d1d7 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -104,7 +104,7 @@ __global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, #pragma unroll for (int it = 0; it < warp_iter; ++it) { int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < element_count) { + if (element_index < effective_element_count) { dst[batch_id * element_count + element_index] = static_cast(elements[it] - max_value - sum); } else { @@ -226,7 +226,7 @@ __global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, #pragma unroll for (int iter = 0; iter < warp_iter; ++iter) { int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < element_count) { + if (element_index < effective_element_count) { grad_input[batch_id * element_count + element_index] = static_cast( (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); } diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 2e8b551ea4e43ce4dd919b6800b9b3784b4a7aac..9a0ce3900acf1c104233aeffb2746c8b4e6f8595 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -118,6 +118,11 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ") for entry attribute.") .SetDefault("none"); + AddAttr("table_class", + "(std::string, default " + ") for 
table_class.") + .SetDefault("none"); + AddAttr>( "table_names", "(string vector, the split table names that will be fetched from " diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 87618b954d232dcfe5d0ed0b8062db7c324c1290..b4a861ed19c1b6204c435f75cee88cd1605525da 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -41,7 +41,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel { output_t->mutable_data(ctx.GetPlace()); framework::NPUAttributeMap attr_input = {{"validate_indices", false}}; - auto runner = + const auto &runner = NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input); auto stream = ctx.template device_context() @@ -65,14 +65,14 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner_zeros = + const auto &runner_zeros = NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); runner_zeros.Run(stream); // NOTE(zhiqiu): It seems in cann 20.1, the first input and output // can be different tensor, but in cann 20.2+, it does inplace operation. // Thus, the first input and output should be same tensor. - auto runner_scatter = + const auto &runner_scatter = NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, {*table_grad_t}, {{"use_locking", true}}); runner_scatter.Run(stream); diff --git a/paddle/fluid/operators/marker_op.cc b/paddle/fluid/operators/marker_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..397e3bfc6ad262d83f46f6751dd9372fbb20efcd --- /dev/null +++ b/paddle/fluid/operators/marker_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { + +class MarkerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + std::string marker_role = ctx->Attrs().Get("marker_role"); + std::string marker_pos = ctx->Attrs().Get("marker_pos"); + + VLOG(3) << "The role is:" << marker_role << ";" + << "The position is:" << marker_pos << "."; + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class MarkerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr("marker_role", + "(string, default forward)forward or backward," + " mark different stages of porcess.") + .SetDefault("forward"); + AddAttr( + "marker_pos", + "(string, default B)the posititon where the marker is placed, " + "B stands for begin of duration," + " E stands for end of duration.") + .SetDefault("B"); + AddComment( + R"DOC(Marker Operator - Add marker at the beginning/end of a forward/backward process.)DOC"); + } +}; + +template +class MarkerOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto marker_role = ctx.Attr("marker_role"); + auto marker_pos = ctx.Attr("marker_pos"); + + platform::RecordEvent record_event( + "MarkerCPU", platform::EventRole::kInnerOp, + "marker_" + marker_role + "_" + marker_pos); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(marker, ops::MarkerOp, ops::MarkerOpMaker); +REGISTER_OP_CPU_KERNEL(marker, ops::MarkerOpCPUKernel); diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b918210389169ab2f85f1a8bcd244e59a480281a --- /dev/null +++ b/paddle/fluid/operators/marker_op.cu @@ -0,0 +1,61 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { + +template +__global__ void SimpleMarkerKernel(T* in, T* out, int ndim) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (; idx < ndim; idx += blockDim.x * gridDim.x) { + out[idx] = in[idx]; + } +} + +template +class MarkerOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + auto marker_role = ctx.Attr("marker_role"); + auto marker_pos = ctx.Attr("marker_pos"); + VLOG(3) << "marker role: " << marker_role + << " marker position: " << marker_pos; + + framework::Tensor A; + framework::Tensor B; + auto* in_temp = A.mutable_data({32, 1}, ctx.GetPlace()); + auto* out_temp = B.mutable_data({32, 1}, ctx.GetPlace()); + platform::RecordEvent record_event( + "MarkerCUDA", platform::EventRole::kInnerOp, + "marker_" + marker_role + "_" + marker_pos); + SimpleMarkerKernel<<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, + 32); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(marker, ops::MarkerOpCUDAKernel); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index 3b44c02757fae9648a7e660a06c03af45d621e02..17bf5df18adc543ea487160a31d05d3c802b95a7 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -26,8 +26,9 @@ class MaskedSelectOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - framework::DDim output_dims(ctx->GetInputDim("X")); - ctx->SetOutputDim("Y", output_dims); + + // output will only be a 1-D Tensor + ctx->SetOutputDim("Y", framework::make_ddim({-1})); ctx->ShareLoD("X", /*->*/ "Y"); } diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 512f9c62415e5d1b09a1b649e78c72ac2d9f2d88..4d7218cd89e04b5122ff4385abfb2c7305e40c0a 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" @@ -311,6 +312,156 @@ __global__ void SoftmaxKernelWithEltadd2( #endif } +template +__global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk, + const int batch_size, + const int head_num, + const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + T stride_max = -1e20f; + for (int i = 0; i < seq_len; i += blockDim.x) { + stride_max = qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] > + stride_max + ? 
qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] + : stride_max; + } + T max_val = blockReduceMax(stride_max, mask); + + T stride_sum = 0.f; + for (int i = 0; i < seq_len; i += blockDim.x) { + stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] - max_val); + } + T sum_val = blockReduceSum(stride_sum, mask); + + for (int i = 0; i < seq_len; i += blockDim.x) { + qk_buf[threadIdx.x + i + qk_offset] = + (T)(__expf(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] - max_val) / + sum_val); + } +} + +// HIP defined __HIP_NO_HALF_CONVERSIONS__ +#ifndef __HIPCC__ // @{ Half kernel: SoftmaxKernelWithEltadd +template <> +__global__ void SoftmaxKernelWithEltaddForLarge( + half *qk_buf, const half *bias_qk, const int batch_size, const int head_num, + const int seq_len, const unsigned mask) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float stride_max = -1e20f; + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]); + stride_max = tmp > stride_max ? tmp : stride_max; + } + float max_val = blockReduceMax(stride_max, mask); + + float stride_sum = 0.f; + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]); + stride_sum += __expf(tmp - max_val); + } + float sum_val = blockReduceSum(stride_sum, mask); + + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = + __expf(static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]) - + max_val); + qk_buf[threadIdx.x + i + qk_offset] = (half)(tmp / sum_val); + } +#endif +} +#endif // @} End Half kernel: SoftmaxKernelWithEltadd + +template +__global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_, + const int batch_size, + const int head_num, + const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float2 stride_max = make_float2(-1e20f, -1e20f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_max.x = max(stride_max.x, cur.x); + stride_max.y = max(stride_max.y, cur.y); + } + float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + + float2 stride_sum = make_float2(0.f, 0.f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_sum.x += __expf(cur.x - max_val); + stride_sum.y += __expf(cur.y - max_val); + } + + float sum_val = + blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); + } +} + +template <> +__global__ void SoftmaxKernelWithEltaddForLarge2( + half2 *qk_buf_, const half2 *bias_qk_, const int batch_size, + const int head_num, const int seq_len, const unsigned mask) { +// operator "+" of half only supported after cuda version 10.0 +// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake +#if defined(PADDLE_WITH_CUDA) && \ +
(CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000) + + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float2 stride_max = make_float2(-1e20f, -1e20f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_max.x = max(stride_max.x, cur.x); + stride_max.y = max(stride_max.y, cur.y); + } + float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + + float2 stride_sum = make_float2(0.f, 0.f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_sum.x += __expf(cur.x - max_val); + stride_sum.y += __expf(cur.y - max_val); + } + + float sum_val = + blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); + } +#endif +} + template inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, int head_num, int seq_len, int size_per_head, @@ -332,31 +483,48 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, reinterpret_cast(qk_buf_), batch_size * head_num, seq_len * size_per_head, seq_len * size_per_head); - int grid = batch_size * head_num * seq_len; - int block = seq_len; - - // Align block to 32, also limit seq_len to max block size. - PADDLE_ENFORCE_LE(seq_len, 1024, platform::errors::InvalidArgument( - "seq_len should <= 1024, " - "but received seq_len is:%d", - seq_len)); - if (seq_len % 2 == 0) { - block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; - if (std::is_same::value) { - SoftmaxKernelWithEltadd2<<>>( - reinterpret_cast(qk_buf_), - reinterpret_cast(bias_qk), batch_size, head_num, - seq_len / 2, FINAL_MASK); + if (seq_len <= 1024) { + int grid = batch_size * head_num * seq_len; + int block = seq_len; + + // Align block to 32, also limit seq_len to max block size. + if (seq_len % 2 == 0) { + block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; + if (std::is_same::value) { + SoftmaxKernelWithEltadd2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltadd2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } } else { - SoftmaxKernelWithEltadd2<__half2><<>>( - reinterpret_cast<__half2 *>(qk_buf_), - reinterpret_cast(bias_qk), batch_size, head_num, - seq_len / 2, FINAL_MASK); + block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; + SoftmaxKernelWithEltadd<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); } } else { - block = (seq_len <= 32) ? 
32 : ((seq_len + 31) / 32) * 32; - SoftmaxKernelWithEltadd<<>>( - qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + int grid = batch_size * head_num * seq_len; + int block = 512; + if (seq_len % 2 == 0) { + if (std::is_same::value) { + SoftmaxKernelWithEltaddForLarge2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltaddForLarge2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } + } else { + SoftmaxKernelWithEltaddForLarge<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + } } } diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index c44c15adb13caf9be401c3174e68e229d1eea745..477f3e0f6a2dc5cfd6fcc0b0624f8f0c2563fe8b 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -260,13 +260,13 @@ struct CUBlas { }; template <> -struct CUBlas { - using complex64 = platform::complex64; - +struct CUBlas> { static void GEMV(cublasHandle_t handle, cublasOperation_t transa, int m, - int n, const complex64 *alpha, const complex64 *A, int lda, - const complex64 *B, int ldb, const complex64 *beta, - complex64 *C, int ldc) { + int n, const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -275,9 +275,10 @@ struct CUBlas { reinterpret_cast(C), ldc)); } - static void AXPY(cublasHandle_t handle, int n, const complex64 *alpha, - const complex64 *X, const int incX, complex64 *Y, - const int incY) { + static void AXPY(cublasHandle_t handle, int n, + const platform::complex *alpha, + const platform::complex *X, const int incX, + platform::complex *Y, const int incY) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, @@ -287,11 +288,13 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex64 *alpha, const complex64 *A, - int lda, long long int strideA, // NOLINT - const complex64 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const complex64 *beta, complex64 *C, int ldc, + const platform::complex *alpha, + const platform::complex *A, int lda, + long long int strideA, // NOLINT + const platform::complex *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const platform::complex *beta, + platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 @@ -310,9 +313,11 @@ struct CUBlas { static void GEMM(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex64 *alpha, const complex64 *A, int lda, - const complex64 *B, int ldb, const complex64 *beta, - complex64 *C, int ldc) { + const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), @@ -356,13 +361,13 @@ struct CUBlas { }; template <> -struct CUBlas { - using complex128 
= platform::complex128; - +struct CUBlas> { static void GEMV(cublasHandle_t handle, cublasOperation_t transa, int m, - int n, const complex128 *alpha, const complex128 *A, int lda, - const complex128 *B, int ldb, const complex128 *beta, - complex128 *C, int ldc) { + int n, const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -371,9 +376,10 @@ struct CUBlas { reinterpret_cast(C), ldc)); } - static void AXPY(cublasHandle_t handle, int n, const complex128 *alpha, - const complex128 *X, const int incX, complex128 *Y, - const int incY) { + static void AXPY(cublasHandle_t handle, int n, + const platform::complex *alpha, + const platform::complex *X, const int incX, + platform::complex *Y, const int incY) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, @@ -383,11 +389,13 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex128 *alpha, const complex128 *A, - int lda, long long int strideA, // NOLINT - const complex128 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const complex128 *beta, complex128 *C, int ldc, + const platform::complex *alpha, + const platform::complex *A, int lda, + long long int strideA, // NOLINT + const platform::complex *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const platform::complex *beta, + platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 @@ -406,9 +414,11 @@ struct CUBlas { static void GEMM(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex128 *alpha, const complex128 *A, int lda, - const complex128 *B, int ldb, const complex128 *beta, - complex128 *C, int ldc) { + const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), @@ -535,9 +545,9 @@ template <> template <> inline void Blas::GEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - platform::complex64 alpha, const platform::complex64 *A, - const platform::complex64 *B, platform::complex64 beta, - platform::complex64 *C) const { + platform::complex alpha, const platform::complex *A, + const platform::complex *B, platform::complex beta, + platform::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -565,16 +575,16 @@ inline void Blas::GEMM( // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. 
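Returning to the bert_encoder_functor.cu change above: MatMulWithHeadQK previously rejected seq_len > 1024 outright because one block handled one row with one element per thread; the new SoftmaxKernelWithEltaddForLarge variants instead walk the row with a fixed 512-thread block in a strided loop, keeping the usual three numerically stable passes (running max of qk + bias, sum of shifted exponentials, normalize). A serial C++ reference of that scheme (block reductions elided; on the GPU each pass is a strided per-thread loop plus a block reduction):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Serial reference for the strided softmax-with-bias used for long rows.
// Assumes bias.size() >= qk->size().
void SoftmaxWithBiasRow(std::vector<float>* qk, const std::vector<float>& bias) {
  float max_val = -1e20f;
  for (std::size_t i = 0; i < qk->size(); ++i)     // pass 1: row maximum
    max_val = std::max(max_val, (*qk)[i] + bias[i]);

  float sum = 0.f;
  for (std::size_t i = 0; i < qk->size(); ++i)     // pass 2: shifted exp sum
    sum += std::exp((*qk)[i] + bias[i] - max_val);

  for (std::size_t i = 0; i < qk->size(); ++i)     // pass 3: normalize
    (*qk)[i] = std::exp((*qk)[i] + bias[i] - max_val) / sum;
}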
auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX( + CUBlas>::GEMM_EX( &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A, CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - &c_alpha, h_B, ldb, h_A, lda, &c_beta, - h_C, N); + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); }); #endif // CUDA_VERSION >= 8000 } @@ -583,9 +593,9 @@ template <> template <> inline void Blas::GEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - platform::complex128 alpha, const platform::complex128 *A, - const platform::complex128 *B, platform::complex128 beta, - platform::complex128 *C) const { + platform::complex alpha, const platform::complex *A, + const platform::complex *B, platform::complex beta, + platform::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -614,16 +624,16 @@ inline void Blas::GEMM( // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX( + CUBlas>::GEMM_EX( &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A, CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - &c_alpha, h_B, ldb, h_A, lda, &c_beta, - h_C, N); + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); }); #endif // CUDA_VERSION >= 8000 } diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 05d42f02c1003af2d1efd1e642860ae7e5b5ba01..eab513e24bc8090d30a42cd1149c6bf65d690839 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -23,8 +23,7 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -324,11 +323,11 @@ struct CBlas { }; template <> -struct CBlas { +struct CBlas> { template - static void AXPY(int n, const paddle::platform::complex64 alpha, - const paddle::platform::complex64 *X, const int incX, - paddle::platform::complex64 *Y, const int incY) { + static void AXPY(int n, const paddle::platform::complex alpha, + const paddle::platform::complex *X, const int incX, + paddle::platform::complex *Y, const int incY) { platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); } @@ -363,35 +362,35 @@ struct CBlas { */ template - static void VADD(int n, const paddle::platform::complex64 *a, - const paddle::platform::complex64 *b, - paddle::platform::complex64 *y) { + static void VADD(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] + b[i]; } } template - static void VSUB(int n, const paddle::platform::complex64 *a, - const paddle::platform::complex64 *b, - paddle::platform::complex64 *y) { + static void VSUB(int n, const 
diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h
index 05d42f02c1003af2d1efd1e642860ae7e5b5ba01..eab513e24bc8090d30a42cd1149c6bf65d690839 100644
--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -23,8 +23,7 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex128.h"
-#include "paddle/fluid/platform/complex64.h"
+#include "paddle/fluid/platform/complex.h"
 
 namespace paddle {
 namespace operators {
@@ -324,11 +323,11 @@ struct CBlas {
 };
 
 template <>
-struct CBlas<platform::complex64> {
+struct CBlas<platform::complex<float>> {
   template <typename... ARGS>
-  static void AXPY(int n, const paddle::platform::complex64 alpha,
-                   const paddle::platform::complex64 *X, const int incX,
-                   paddle::platform::complex64 *Y, const int incY) {
+  static void AXPY(int n, const paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *X, const int incX,
+                   paddle::platform::complex<float> *Y, const int incY) {
     platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY);
   }
 
@@ -363,35 +362,35 @@
    */
 
   template <typename... ARGS>
-  static void VADD(int n, const paddle::platform::complex64 *a,
-                   const paddle::platform::complex64 *b,
-                   paddle::platform::complex64 *y) {
+  static void VADD(int n, const paddle::platform::complex<float> *a,
+                   const paddle::platform::complex<float> *b,
+                   paddle::platform::complex<float> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] + b[i];
     }
   }
 
   template <typename... ARGS>
-  static void VSUB(int n, const paddle::platform::complex64 *a,
-                   const paddle::platform::complex64 *b,
-                   paddle::platform::complex64 *y) {
+  static void VSUB(int n, const paddle::platform::complex<float> *a,
+                   const paddle::platform::complex<float> *b,
+                   paddle::platform::complex<float> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] - b[i];
     }
   }
 
   template <typename... ARGS>
-  static void VMUL(int n, const paddle::platform::complex64 *a,
-                   const paddle::platform::complex64 *b,
-                   paddle::platform::complex64 *y) {
+  static void VMUL(int n, const paddle::platform::complex<float> *a,
+                   const paddle::platform::complex<float> *b,
+                   paddle::platform::complex<float> *y) {
     for (int i = 0; i < n; ++i) {
      y[i] = a[i] * b[i];
     }
   }
 
   template <typename... ARGS>
-  static void VDIV(int n, const paddle::platform::complex64 *a,
-                   const paddle::platform::complex64 *b,
-                   paddle::platform::complex64 *y) {
+  static void VDIV(int n, const paddle::platform::complex<float> *a,
+                   const paddle::platform::complex<float> *b,
+                   paddle::platform::complex<float> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] / b[i];
     }
@@ -399,11 +398,11 @@
 
   template <typename... ARGS>
   static void GEMV(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int M, int N,
-                   paddle::platform::complex64 alpha,
-                   const paddle::platform::complex64 *A, int lda,
-                   const paddle::platform::complex64 *X, int incx,
-                   paddle::platform::complex64 beta,
-                   paddle::platform::complex64 *Y, int incy) {
+                   paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *A, int lda,
+                   const paddle::platform::complex<float> *X, int incx,
+                   paddle::platform::complex<float> beta,
+                   paddle::platform::complex<float> *Y, int incy) {
     const void *a_ = (const void *)(A);
     const void *x_ = (const void *)(X);
     void *y_ = static_cast<void *>(Y);
@@ -414,11 +413,11 @@
 
   template <typename... ARGS>
   static void GEMM(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans_a,
                    CBLAS_TRANSPOSE trans_b, int M, int N, int K,
-                   paddle::platform::complex64 alpha,
-                   const paddle::platform::complex64 *A, int lda,
-                   const paddle::platform::complex64 *B, int ldb,
-                   paddle::platform::complex64 beta,
-                   paddle::platform::complex64 *C, int ldc) {
+                   paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *A, int lda,
+                   const paddle::platform::complex<float> *B, int ldb,
+                   paddle::platform::complex<float> beta,
+                   paddle::platform::complex<float> *C, int ldc) {
     const void *a_ = (const void *)(A);
     const void *b_ = (const void *)(B);
     void *c_ = static_cast<void *>(C);
@@ -429,11 +428,12 @@
 
   template <typename... ARGS>
   static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a,
                          CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K,
-                         paddle::platform::complex64 *alpha,
-                         const paddle::platform::complex64 **A, const int *lda,
-                         const paddle::platform::complex64 **B, const int *ldb,
-                         paddle::platform::complex64 *beta,
-                         paddle::platform::complex64 **C, const int *ldc,
+                         paddle::platform::complex<float> *alpha,
+                         const paddle::platform::complex<float> **A,
+                         const int *lda,
+                         const paddle::platform::complex<float> **B,
+                         const int *ldb, paddle::platform::complex<float> *beta,
+                         paddle::platform::complex<float> **C, const int *ldc,
                          int group_count, int *group_size) {
     const void **A_void = (const void **)(&(*A));
     const void **B_void = (const void **)(&(*B));
@@ -451,11 +451,11 @@
 };
 
 template <>
-struct CBlas<platform::complex128> {
+struct CBlas<platform::complex<double>> {
   template <typename... ARGS>
-  static void AXPY(int n, const paddle::platform::complex128 alpha,
-                   const paddle::platform::complex128 *X, const int incX,
-                   paddle::platform::complex128 *Y, const int incY) {
+  static void AXPY(int n, const paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *X, const int incX,
+                   paddle::platform::complex<double> *Y, const int incY) {
     platform::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY);
   }
 
@@ -490,35 +490,35 @@
    */
 
   template <typename... ARGS>
-  static void VADD(int n, const paddle::platform::complex128 *a,
-                   const paddle::platform::complex128 *b,
-                   paddle::platform::complex128 *y) {
+  static void VADD(int n, const paddle::platform::complex<double> *a,
+                   const paddle::platform::complex<double> *b,
+                   paddle::platform::complex<double> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] + b[i];
     }
   }
 
   template <typename... ARGS>
-  static void VSUB(int n, const paddle::platform::complex128 *a,
-                   const paddle::platform::complex128 *b,
-                   paddle::platform::complex128 *y) {
+  static void VSUB(int n, const paddle::platform::complex<double> *a,
+                   const paddle::platform::complex<double> *b,
+                   paddle::platform::complex<double> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] - b[i];
     }
   }
 
   template <typename... ARGS>
-  static void VMUL(int n, const paddle::platform::complex128 *a,
-                   const paddle::platform::complex128 *b,
-                   paddle::platform::complex128 *y) {
+  static void VMUL(int n, const paddle::platform::complex<double> *a,
+                   const paddle::platform::complex<double> *b,
+                   paddle::platform::complex<double> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] * b[i];
     }
   }
 
   template <typename... ARGS>
-  static void VDIV(int n, const paddle::platform::complex128 *a,
-                   const paddle::platform::complex128 *b,
-                   paddle::platform::complex128 *y) {
+  static void VDIV(int n, const paddle::platform::complex<double> *a,
+                   const paddle::platform::complex<double> *b,
+                   paddle::platform::complex<double> *y) {
     for (int i = 0; i < n; ++i) {
       y[i] = a[i] / b[i];
     }
@@ -526,11 +526,11 @@
 
   template <typename... ARGS>
   static void GEMV(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int M, int N,
-                   paddle::platform::complex128 alpha,
-                   const paddle::platform::complex128 *A, int lda,
-                   const paddle::platform::complex128 *X, int incx,
-                   paddle::platform::complex128 beta,
-                   paddle::platform::complex128 *Y, int incy) {
+                   paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *A, int lda,
+                   const paddle::platform::complex<double> *X, int incx,
+                   paddle::platform::complex<double> beta,
+                   paddle::platform::complex<double> *Y, int incy) {
     const void *a_ = (const void *)(A);
     const void *x_ = (const void *)(X);
     void *y_ = static_cast<void *>(Y);
@@ -541,11 +541,11 @@
 
   template <typename... ARGS>
   static void GEMM(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans_a,
                    CBLAS_TRANSPOSE trans_b, int M, int N, int K,
-                   paddle::platform::complex128 alpha,
-                   const paddle::platform::complex128 *A, int lda,
-                   const paddle::platform::complex128 *B, int ldb,
-                   paddle::platform::complex128 beta,
-                   paddle::platform::complex128 *C, int ldc) {
+                   paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *A, int lda,
+                   const paddle::platform::complex<double> *B, int ldb,
+                   paddle::platform::complex<double> beta,
+                   paddle::platform::complex<double> *C, int ldc) {
     const void *a_ = (const void *)(A);
     const void *b_ = (const void *)(B);
     void *c_ = static_cast<void *>(C);
@@ -556,11 +556,13 @@
 
   template <typename... ARGS>
   static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a,
                          CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K,
-                         paddle::platform::complex128 *alpha,
-                         const paddle::platform::complex128 **A, const int *lda,
-                         const paddle::platform::complex128 **B, const int *ldb,
-                         paddle::platform::complex128 *beta,
-                         paddle::platform::complex128 **C, const int *ldc,
+                         paddle::platform::complex<double> *alpha,
+                         const paddle::platform::complex<double> **A,
+                         const int *lda,
+                         const paddle::platform::complex<double> **B,
+                         const int *ldb,
+                         paddle::platform::complex<double> *beta,
+                         paddle::platform::complex<double> **C, const int *ldc,
                          int group_count, int *group_size) {
     const void **A_void = (const void **)(&(*A));
     const void **B_void = (const void **)(&(*B));
@@ -636,76 +638,76 @@
 };
 
 template <>
-struct CBlas<platform::complex64> {
+struct CBlas<platform::complex<float>> {
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     cblas_ccopy(args...);
   }
 
   template <typename... ARGS>
-  static void AXPY(int n, const paddle::platform::complex64 alpha,
-                   const paddle::platform::complex64 *X, const int incX,
-                   paddle::platform::complex64 *Y, const int incY) {
+  static void AXPY(int n, const paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *X, const int incX,
+                   paddle::platform::complex<float> *Y, const int incY) {
     cblas_caxpy(n, &alpha, X, incX, Y, incY);
   }
 
   template <typename... ARGS>
   static void GEMV(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA,
                    const int M, const int N,
-                   const paddle::platform::complex64 alpha,
-                   const paddle::platform::complex64 *A, const int lda,
-                   const paddle::platform::complex64 *X, const int incX,
-                   const paddle::platform::complex64 beta,
-                   paddle::platform::complex64 *Y, const int incY) {
+                   const paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *A, const int lda,
+                   const paddle::platform::complex<float> *X, const int incX,
+                   const paddle::platform::complex<float> beta,
+                   paddle::platform::complex<float> *Y, const int incY) {
     cblas_cgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY);
   }
 
   template <typename... ARGS>
   static void GEMM(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA,
                    const CBLAS_TRANSPOSE TransB, const int M, const int N,
-                   const int K, const paddle::platform::complex64 alpha,
-                   const paddle::platform::complex64 *A, const int lda,
-                   const paddle::platform::complex64 *B, const int ldb,
-                   const paddle::platform::complex64 beta,
-                   paddle::platform::complex64 *C, const int ldc) {
+                   const int K, const paddle::platform::complex<float> alpha,
+                   const paddle::platform::complex<float> *A, const int lda,
+                   const paddle::platform::complex<float> *B, const int ldb,
+                   const paddle::platform::complex<float> beta,
+                   paddle::platform::complex<float> *C, const int ldc) {
     cblas_cgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb,
                 &beta, C, ldc);
   }
 };
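The specialization above is a thin shim from complex<float> onto the c-prefixed CBLAS entry points. For reference, the same AXPY expressed directly against CBLAS with std::complex<float> (a self-contained sketch, not Paddle code):

#include <cblas.h>
#include <complex>

int main() {
  std::complex<float> alpha(1.0f, 0.0f);
  std::complex<float> x[3] = {{1, 1}, {2, 2}, {3, 3}};
  std::complex<float> y[3] = {{0, 0}, {0, 0}, {0, 0}};
  // y = alpha * x + y; CBLAS takes the complex scalar through a void pointer.
  cblas_caxpy(3, &alpha, x, 1, y, 1);
  return 0;
}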
 template <>
-struct CBlas<platform::complex128> {
+struct CBlas<platform::complex<double>> {
   template <typename... ARGS>
   static void VCOPY(ARGS... args) {
     cblas_zcopy(args...);
   }
 
   template <typename... ARGS>
-  static void AXPY(int n, const paddle::platform::complex128 alpha,
-                   const paddle::platform::complex128 *X, const int incX,
-                   paddle::platform::complex128 *Y, const int incY) {
+  static void AXPY(int n, const paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *X, const int incX,
+                   paddle::platform::complex<double> *Y, const int incY) {
     cblas_zaxpy(n, &alpha, X, incX, Y, incY);
   }
 
   template <typename... ARGS>
   static void GEMV(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA,
                    const int M, const int N,
-                   const paddle::platform::complex128 alpha,
-                   const paddle::platform::complex128 *A, const int lda,
-                   const paddle::platform::complex128 *X, const int incX,
-                   const paddle::platform::complex128 beta,
-                   paddle::platform::complex128 *Y, const int incY) {
+                   const paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *A, const int lda,
+                   const paddle::platform::complex<double> *X, const int incX,
+                   const paddle::platform::complex<double> beta,
+                   paddle::platform::complex<double> *Y, const int incY) {
     cblas_zgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY);
   }
 
   template <typename... ARGS>
   static void GEMM(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA,
                    const CBLAS_TRANSPOSE TransB, const int M, const int N,
-                   const int K, const paddle::platform::complex128 alpha,
-                   const paddle::platform::complex128 *A, const int lda,
-                   const paddle::platform::complex128 *B, const int ldb,
-                   const paddle::platform::complex128 beta,
-                   paddle::platform::complex128 *C, const int ldc) {
+                   const int K, const paddle::platform::complex<double> alpha,
+                   const paddle::platform::complex<double> *A, const int lda,
+                   const paddle::platform::complex<double> *B, const int ldb,
+                   const paddle::platform::complex<double> beta,
+                   paddle::platform::complex<double> *C, const int ldc) {
     cblas_zgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb,
                 &beta, C, ldc);
   }
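Note that, unlike the float/double specializations, the complex VADD/VSUB/VMUL/VDIV above are plain scalar loops; the pattern works for any element type with the matching operator. A reduced sketch of that loop shape:

#include <complex>

// Sketch of the loop pattern used by the complex elementwise specializations.
template <typename T>
void vadd(int n, const T* a, const T* b, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a[i] + b[i];
  }
}

template void vadd<std::complex<float>>(int, const std::complex<float>*,
                                        const std::complex<float>*,
                                        std::complex<float>*);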
diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h
index 81110b591a1cbb3dd60a618b329b70e71b4912fe..788ebc6ad985c5fb6e6667220713783f014d2a62 100644
--- a/paddle/fluid/operators/math/blas_impl.hip.h
+++ b/paddle/fluid/operators/math/blas_impl.hip.h
@@ -213,13 +213,13 @@ struct CUBlas {
 };
 
 template <>
-struct CUBlas<platform::complex64> {
-  using complex64 = platform::complex64;
-
+struct CUBlas<platform::complex<float>> {
   static void GEMV(rocblas_handle handle, rocblas_operation transa, int m,
-                   int n, const complex64 *alpha, const complex64 *A, int lda,
-                   const complex64 *B, int ldb, const complex64 *beta,
-                   complex64 *C, int ldc) {
+                   int n, const platform::complex<float> *alpha,
+                   const platform::complex<float> *A, int lda,
+                   const platform::complex<float> *B, int ldb,
+                   const platform::complex<float> *beta,
+                   platform::complex<float> *C, int ldc) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemv(
         handle, transa, m, n,
         reinterpret_cast<const rocblas_float_complex *>(alpha),
@@ -229,9 +229,10 @@ struct CUBlas<platform::complex64> {
         reinterpret_cast<rocblas_float_complex *>(C), ldc));
   }
 
-  static void AXPY(rocblas_handle handle, int n, const complex64 *alpha,
-                   const complex64 *X, const int incX, complex64 *Y,
-                   const int incY) {
+  static void AXPY(rocblas_handle handle, int n,
+                   const platform::complex<float> *alpha,
+                   const platform::complex<float> *X, const int incX,
+                   platform::complex<float> *Y, const int incY) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_caxpy(
         handle, n, reinterpret_cast<const rocblas_float_complex *>(alpha),
         reinterpret_cast<const rocblas_float_complex *>(X), incX,
@@ -241,11 +242,13 @@ struct CUBlas<platform::complex64> {
 
   static void GEMM_STRIDED_BATCH(rocblas_handle handle,
                                  rocblas_operation transa,
                                  rocblas_operation transb, int m, int n, int k,
-                                 const complex64 *alpha, const complex64 *A,
-                                 int lda, long long int strideA,  // NOLINT
-                                 const complex64 *B,              // NOLINT
-                                 int ldb, long long int strideB,  // NOLINT
-                                 const complex64 *beta, complex64 *C, int ldc,
+                                 const platform::complex<float> *alpha,
+                                 const platform::complex<float> *A, int lda,
+                                 long long int strideA,              // NOLINT
+                                 const platform::complex<float> *B,  // NOLINT
+                                 int ldb, long long int strideB,     // NOLINT
+                                 const platform::complex<float> *beta,
+                                 platform::complex<float> *C, int ldc,
                                  long long int strideC,  // NOLINT
                                  int batchCount) {
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -261,9 +264,11 @@ struct CUBlas<platform::complex64> {
 
   static void GEMM(rocblas_handle handle, rocblas_operation transa,
                    rocblas_operation transb, int m, int n, int k,
-                   const complex64 *alpha, const complex64 *A, int lda,
-                   const complex64 *B, int ldb, const complex64 *beta,
-                   complex64 *C, int ldc) {
+                   const platform::complex<float> *alpha,
+                   const platform::complex<float> *A, int lda,
+                   const platform::complex<float> *B, int ldb,
+                   const platform::complex<float> *beta,
+                   platform::complex<float> *C, int ldc) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemm(
         handle, transa, transb, m, n, k,
         reinterpret_cast<const rocblas_float_complex *>(alpha),
@@ -293,13 +298,13 @@ struct CUBlas<platform::complex64> {
 };
 
 template <>
-struct CUBlas<platform::complex128> {
-  using complex128 = platform::complex128;
-
+struct CUBlas<platform::complex<double>> {
   static void GEMV(rocblas_handle handle, rocblas_operation transa, int m,
-                   int n, const complex128 *alpha, const complex128 *A, int lda,
-                   const complex128 *B, int ldb, const complex128 *beta,
-                   complex128 *C, int ldc) {
+                   int n, const platform::complex<double> *alpha,
+                   const platform::complex<double> *A, int lda,
+                   const platform::complex<double> *B, int ldb,
+                   const platform::complex<double> *beta,
+                   platform::complex<double> *C, int ldc) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemv(
         handle, transa, m, n,
         reinterpret_cast<const rocblas_double_complex *>(alpha),
@@ -309,9 +314,10 @@ struct CUBlas<platform::complex128> {
         reinterpret_cast<rocblas_double_complex *>(C), ldc));
   }
 
-  static void AXPY(rocblas_handle handle, int n, const complex128 *alpha,
-                   const complex128 *X, const int incX, complex128 *Y,
-                   const int incY) {
+  static void AXPY(rocblas_handle handle, int n,
+                   const platform::complex<double> *alpha,
+                   const platform::complex<double> *X, const int incX,
+                   platform::complex<double> *Y, const int incY) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zaxpy(
         handle, n, reinterpret_cast<const rocblas_double_complex *>(alpha),
         reinterpret_cast<const rocblas_double_complex *>(X), incX,
@@ -321,11 +327,13 @@ struct CUBlas<platform::complex128> {
 
   static void GEMM_STRIDED_BATCH(rocblas_handle handle,
                                  rocblas_operation transa,
                                  rocblas_operation transb, int m, int n, int k,
-                                 const complex128 *alpha, const complex128 *A,
-                                 int lda, long long int strideA,  // NOLINT
-                                 const complex128 *B,             // NOLINT
-                                 int ldb, long long int strideB,  // NOLINT
-                                 const complex128 *beta, complex128 *C, int ldc,
+                                 const platform::complex<double> *alpha,
+                                 const platform::complex<double> *A, int lda,
+                                 long long int strideA,               // NOLINT
+                                 const platform::complex<double> *B,  // NOLINT
+                                 int ldb, long long int strideB,      // NOLINT
+                                 const platform::complex<double> *beta,
+                                 platform::complex<double> *C, int ldc,
                                  long long int strideC,  // NOLINT
                                  int batchCount) {
     PADDLE_ENFORCE_CUDA_SUCCESS(
@@ -341,9 +349,11 @@ struct CUBlas<platform::complex128> {
 
   static void GEMM(rocblas_handle handle, rocblas_operation transa,
                    rocblas_operation transb, int m, int n, int k,
-                   const complex128 *alpha, const complex128 *A, int lda,
-                   const complex128 *B, int ldb, const complex128 *beta,
-                   complex128 *C, int ldc) {
+                   const platform::complex<double> *alpha,
+                   const platform::complex<double> *A, int lda,
+                   const platform::complex<double> *B, int ldb,
+                   const platform::complex<double> *beta,
+                   platform::complex<double> *C, int ldc) {
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemm(
         handle, transa, transb, m, n, k,
         reinterpret_cast<const rocblas_double_complex *>(alpha),
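GEMM_STRIDED_BATCH exposes the rocBLAS strided-batched convention: batch i reads A + i*strideA and B + i*strideB and writes C + i*strideC. A small worked example of those offsets (the sizes are hypothetical, not taken from this patch):

#include <cstddef>

// Sketch: for densely packed m x k / k x n / m x n batches,
//   strideA = m * k, strideB = k * n, strideC = m * n,
// so batch i starts stride * i elements into each buffer.
inline std::size_t batch_offset(std::size_t stride, std::size_t i) {
  return stride * i;  // e.g. m = n = k = 4 gives stride 16 per batch
}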
@@ -434,9 +444,9 @@ template <>
 template <>
 inline void Blas<platform::CUDADeviceContext>::GEMM(
     CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    platform::complex64 alpha, const platform::complex64 *A,
-    const platform::complex64 *B, platform::complex64 beta,
-    platform::complex64 *C) const {
+    platform::complex<float> alpha, const platform::complex<float> *A,
+    const platform::complex<float> *B, platform::complex<float> beta,
+    platform::complex<float> *C) const {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -461,7 +471,7 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
   thrust::complex<float> c_beta = thrust::complex<float>(beta.real, beta.imag);
 
   auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
-  CUBlas<platform::complex64>::GEMM_EX(
+  CUBlas<platform::complex<float>>::GEMM_EX(
       &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
       rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C,
       rocblas_datatype_f32_c, N, rocblas_datatype_f32_c);
@@ -471,9 +481,9 @@ template <>
 template <>
 inline void Blas<platform::CUDADeviceContext>::GEMM(
     CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K,
-    platform::complex128 alpha, const platform::complex128 *A,
-    const platform::complex128 *B, platform::complex128 beta,
-    platform::complex128 *C) const {
+    platform::complex<double> alpha, const platform::complex<double> *A,
+    const platform::complex<double> *B, platform::complex<double> beta,
+    platform::complex<double> *C) const {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -499,7 +509,7 @@ inline void Blas<platform::CUDADeviceContext>::GEMM(
       thrust::complex<double>(beta.real, beta.imag);
 
   auto &cuda_ctx = const_cast<platform::CUDADeviceContext &>(context_);
-  CUBlas<platform::complex128>::GEMM_EX(
+  CUBlas<platform::complex<double>>::GEMM_EX(
       &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B,
       rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C,
      rocblas_datatype_f64_c, N, rocblas_datatype_f64_c);
diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h
index f530256677854860fd7d3c6163a142ad8ba2da42..c4bd6ec4f14a27c76e3ae9f977625f312600065b 100644
--- a/paddle/fluid/operators/math/complex_functors.h
+++ b/paddle/fluid/operators/math/complex_functors.h
@@ -64,9 +64,7 @@ using select_t = typename select::type;
 
 template <typename T>
 using Real =
-    select_t<cond<std::is_same<T, platform::complex64>::value, float>,
-             cond<std::is_same<T, platform::complex128>::value, double>,
-             cond<std::is_same<T, platform::complex<float>>::value, float>,
+    select_t<cond<std::is_same<T, platform::complex<float>>::value, float>,
              cond<std::is_same<T, platform::complex<double>>::value, double>,
             T>;
 
@@ -79,15 +77,11 @@ using NoComplex = typename std::enable_if::value>::type;
 
 template <typename T>
 using EnableComplex = typename std::enable_if<
-    std::is_same<T, platform::complex64>::value ||
-    std::is_same<T, platform::complex128>::value ||
     std::is_same<T, platform::complex<float>>::value ||
     std::is_same<T, platform::complex<double>>::value>::type;
 
 template <typename T>
 using DisableComplex = typename std::enable_if<
-    !std::is_same<T, platform::complex64>::value &&
-    !std::is_same<T, platform::complex128>::value &&
     !std::is_same<T, platform::complex<float>>::value &&
     !std::is_same<T, platform::complex<double>>::value>::type;
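After the cleanup, Real<T> and EnableComplex/DisableComplex only have to consider the two platform::complex<T> instantiations. The trait's job, re-expressed with std-only machinery (a sketch that mirrors, rather than reproduces, the select_t/cond definitions):

#include <complex>
#include <type_traits>

// Sketch: map a (possibly complex) element type to its real scalar type,
// mirroring what Real<T> computes via select_t/cond.
template <typename T>
struct real_of { using type = T; };
template <typename T>
struct real_of<std::complex<T>> { using type = T; };

static_assert(std::is_same<typename real_of<std::complex<float>>::type,
                           float>::value, "complex<float> -> float");
static_assert(std::is_same<typename real_of<double>::type, double>::value,
              "non-complex types map to themselves");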
diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h
index a79a9da0b30f2be8f591e584bc56740e38b34594..65d2ca79e60c2ec90d879ce9818c398adc93c73c 100644
--- a/paddle/fluid/operators/math/concat_and_split.h
+++ b/paddle/fluid/operators/math/concat_and_split.h
@@ -65,18 +65,16 @@ class SplitFunctor {
 
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_ALL_TYPES(macro)                                       \
-  macro(int);                                                      \
-  macro(float);                                                    \
-  macro(double);                                                   \
-  macro(bool);                                                     \
-  macro(int64_t);                                                  \
-  macro(int16_t);                                                  \
-  macro(uint8_t);                                                  \
-  macro(int8_t);                                                   \
-  macro(::paddle::platform::float16);                              \
-  macro(::paddle::platform::bfloat16);                             \
-  macro(::paddle::platform::complex<float>);                       \
-  macro(::paddle::platform::complex<double>);                      \
-  macro(::paddle::platform::complex64);                            \
-  macro(::paddle::platform::complex128)
+#define FOR_ALL_TYPES(macro)                 \
+  macro(int);                                \
+  macro(float);                              \
+  macro(double);                             \
+  macro(bool);                               \
+  macro(int64_t);                            \
+  macro(int16_t);                            \
+  macro(uint8_t);                            \
+  macro(int8_t);                             \
+  macro(::paddle::platform::float16);        \
+  macro(::paddle::platform::bfloat16);       \
+  macro(::paddle::platform::complex<float>); \
+  macro(::paddle::platform::complex<double>);
diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc
index 011c85caf04bbb3881a856caece3e3db70a055fc..c8e2acea451a473b757dcbd912bed1e9970e0bd1 100644
--- a/paddle/fluid/operators/math/concat_test.cc
+++ b/paddle/fluid/operators/math/concat_test.cc
@@ -437,6 +437,8 @@ void TestConcatMain() {
   ConcatCase2<DeviceContext, Place>(context);
   ConcatCase3<DeviceContext, Place>(context);
   ConcatCase4<DeviceContext, Place>(context);
+
+  delete context;
 }
 
 TEST(math, concat) {
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index d01a39ecb7c931e855c0ed954a0bc5e23732d168..1266ee7462d2d5cca38905bcfde54932f0f8efb5 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -45,8 +45,6 @@ template struct SetConstant;
 template struct SetConstant<platform::CPUDeviceContext, int>;
 template struct SetConstant<platform::CPUDeviceContext, int64_t>;
 template struct SetConstant<platform::CPUDeviceContext, bool>;
 template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
-template struct SetConstant<platform::CPUDeviceContext, platform::complex64>;
-template struct SetConstant<platform::CPUDeviceContext, platform::complex128>;
 template struct SetConstant<platform::CPUDeviceContext,
                             platform::complex<float>>;
@@ -57,8 +55,6 @@ template struct SetConstant;
 template struct SetConstant<platform::XPUDeviceContext, int>;
 template struct SetConstant<platform::XPUDeviceContext, int64_t>;
 template struct SetConstant<platform::XPUDeviceContext, bool>;
 template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
-template struct SetConstant<platform::XPUDeviceContext, platform::complex64>;
-template struct SetConstant<platform::XPUDeviceContext, platform::complex128>;
 template struct SetConstant<platform::XPUDeviceContext,
                             platform::complex<float>>;
 template struct SetConstant<platform::XPUDeviceContext,
                             platform::complex<double>>;
 #endif
 
-#define DEFINE_CPU_TRANS(RANK)                                                 \
-  template struct Transpose<platform::CPUDeviceContext, platform::float16, RANK>;  \
-  template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, RANK>; \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>;          \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;         \
-  template struct Transpose<platform::CPUDeviceContext, int, RANK>;            \
-  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;        \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;           \
-  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;        \
-  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;        \
-  template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;         \
-  template struct Transpose<platform::CPUDeviceContext,                        \
-                            platform::complex<float>, RANK>;                   \
-  template struct Transpose<platform::CPUDeviceContext,                        \
-                            platform::complex<double>, RANK>;                  \
-  template struct Transpose<platform::CPUDeviceContext, platform::complex64, RANK>; \
-  template struct Transpose<platform::CPUDeviceContext, platform::complex128, RANK>;
+#define DEFINE_CPU_TRANS(RANK)                                                 \
+  template struct Transpose<platform::CPUDeviceContext, platform::float16, RANK>;  \
+  template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>;          \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;         \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;            \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;        \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;           \
+  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;        \
+  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;        \
+  template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;         \
+  template struct Transpose<platform::CPUDeviceContext,                        \
+                            platform::complex<float>, RANK>;                   \
+  template struct Transpose<platform::CPUDeviceContext,                        \
+                            platform::complex<double>, RANK>;
 
 DEFINE_CPU_TRANS(1);
 DEFINE_CPU_TRANS(2);
@@ -140,8 +132,6 @@ DEFINE_CPU_TRANS_NORMAL(bool);
 DEFINE_CPU_TRANS_NORMAL(int16_t);
 DEFINE_CPU_TRANS_NORMAL(uint8_t);
 DEFINE_CPU_TRANS_NORMAL(int8_t);
-DEFINE_CPU_TRANS_NORMAL(platform::complex64);
-DEFINE_CPU_TRANS_NORMAL(platform::complex128);
 DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
 DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
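DEFINE_CPU_TRANS(RANK) is an explicit-instantiation macro, so dropping the complex64/complex128 entries removes two instantiations from every rank's expansion. The underlying pattern, reduced to a compilable sketch:

#include <cstddef>

// Sketch: the explicit-instantiation pattern behind DEFINE_CPU_TRANS.
template <typename T, std::size_t Rank>
struct TransposeSketch {
  void operator()(const T* /*in*/, T* /*out*/) const {}
};

#define DEFINE_TRANS_SKETCH(RANK)               \
  template struct TransposeSketch<float, RANK>; \
  template struct TransposeSketch<double, RANK>;

DEFINE_TRANS_SKETCH(1)
DEFINE_TRANS_SKETCH(2)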
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index c5c78c87f7977234fe25b2c1774157e2f55a8c84..248f62129991328fd59886192bd7de95bf2b3037 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -20,8 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/math_function_impl.h"
 #include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex128.h"
-#include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -30,8 +28,6 @@ namespace math {
 
 using float16 = paddle::platform::float16;
 using bfloat16 = paddle::platform::bfloat16;
-using complex64 = paddle::platform::complex64;
-using complex128 = paddle::platform::complex128;
 
 template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
 template struct SetConstant<platform::CUDADeviceContext, platform::bfloat16>;
@@ -41,27 +37,23 @@ template struct SetConstant;
 template struct SetConstant<platform::CUDADeviceContext, double>;
 template struct SetConstant<platform::CUDADeviceContext, int>;
 template struct SetConstant<platform::CUDADeviceContext, int64_t>;
 template struct SetConstant<platform::CUDADeviceContext, bool>;
-template struct SetConstant<platform::CUDADeviceContext, complex64>;
-template struct SetConstant<platform::CUDADeviceContext, complex128>;
 template struct SetConstant<platform::CUDADeviceContext,
                             platform::complex<float>>;
 template struct SetConstant<platform::CUDADeviceContext,
                             platform::complex<double>>;
 
-#define DEFINE_GPU_TRANS(RANK)                                           \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>;   \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, float16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext,                 \
-                            paddle::platform::complex<float>, RANK>;     \
-  template struct Transpose<platform::CUDADeviceContext,                 \
-                            paddle::platform::complex<double>, RANK>;    \
-  template struct Transpose<platform::CUDADeviceContext, complex64, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, complex128, RANK>;
+#define DEFINE_GPU_TRANS(RANK)                                            \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>;    \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, float16, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, bfloat16, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, int8_t, RANK>;   \
+  template struct Transpose<platform::CUDADeviceContext, int32_t, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;  \
+  template struct Transpose<platform::CUDADeviceContext,                  \
+                            paddle::platform::complex<float>, RANK>;      \
+  template struct Transpose<platform::CUDADeviceContext,                  \
+                            paddle::platform::complex<double>, RANK>;
 
 DEFINE_GPU_TRANS(1);
 DEFINE_GPU_TRANS(2);
@@ -151,8 +143,6 @@ DEFINE_GPU_TRANS_NORMAL(bool);
 DEFINE_GPU_TRANS_NORMAL(int16_t);
 DEFINE_GPU_TRANS_NORMAL(uint8_t);
 DEFINE_GPU_TRANS_NORMAL(int8_t);
-DEFINE_GPU_TRANS_NORMAL(complex64);
-DEFINE_GPU_TRANS_NORMAL(complex128);
 DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex<float>);
 DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex<double>);
diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc
index 3388d7edafecc4c0dd3a041316dc6f171d035319..32f9938dcacfbb0d314da912dc217949a544ea9b 100644
--- a/paddle/fluid/operators/math/math_function_test.cc
+++ b/paddle/fluid/operators/math/math_function_test.cc
@@ -208,6 +208,7 @@ void GemvTest(int m, int n, bool trans) {
       ASSERT_FLOAT_EQ(data_c[i], sum);
     }
   }
+  delete cpu_place;
 }
 
 TEST(math_function, gemv) {
@@ -274,6 +275,7 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
   for (int i = 0; i < mat_c_mkl.numel(); ++i) {
     EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
   }
+  delete cpu_place;
 }
 
 TEST(math_function, gemm_warp) {
diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h
index 379b21c3c18888989663221052e6e99df80e7e9d..529d39c9ba50f016434b0b14c4d85c84483bad7f 100644
--- a/paddle/fluid/operators/math/padding.h
+++ b/paddle/fluid/operators/math/padding.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace paddle {
 namespace operators {
@@ -29,7 +30,7 @@ template
 void PadFunction(const framework::ExecutionContext& context,
                  const std::vector<int>& pads, const framework::Tensor& src,
                  T pad_value, framework::Tensor* out) {
-  Eigen::array<std::pair<int, int>, D> paddings;
+  std::array<std::pair<int, int>, D> paddings;
 
   for (size_t i = 0; i < paddings.size(); ++i) {
     paddings[i].first = pads[i * 2];
@@ -41,14 +42,15 @@ void PadFunction(const framework::ExecutionContext& context,
   auto& place =
       *context.template device_context<DeviceContext>().eigen_device();
-  out_tensor.device(place) = src_tensor.pad(paddings, pad_value);
+  EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
+      place, out_tensor, src_tensor, paddings, pad_value);
 }
 
 template <typename DeviceContext, typename T, size_t D>
 void PadGradFunction(const framework::ExecutionContext& context,
                      const std::vector<int>& pads, const framework::Tensor& src,
                      framework::Tensor* d_out) {
-  Eigen::array<std::pair<int, int>, D> paddings;
+  std::array<std::pair<int, int>, D> paddings;
   for (size_t i = 0; i < paddings.size(); ++i) {
     paddings[i].first = -pads[i * 2];
     paddings[i].second = -pads[i * 2 + 1];
@@ -58,7 +60,8 @@ void PadGradFunction(const framework::ExecutionContext& context,
   auto src_tensor = EigenTensor<T, D>::From(src);
   auto& place =
       *context.template device_context<DeviceContext>().eigen_device();
-  d_out_tensor.device(place) = src_tensor.pad(paddings, static_cast<T>(0));
+  EigenPad<std::decay_t<decltype(place)>, T, D>::Eval(
+      place, d_out_tensor, src_tensor, paddings, static_cast<T>(0));
 }
 
 template
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index b9a1854a66118eac0a970117b92a4c80241fd7ba..ee405be5ae9a63a00f5809a97b4df1d4400d78f2 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -297,7 +297,9 @@ template struct SelectedRowsAddToTensor
 
 template <typename T>
-typename std::enable_if<std::is_floating_point<T>::value>::type
+typename std::enable_if<std::is_floating_point<T>::value ||
+                        std::is_same<T, platform::complex<float>>::value ||
+                        std::is_same<T, platform::complex<double>>::value>::type
 elementwise_add_to(BlasT<platform::CPUDeviceContext, T>* blas,
                    size_t data_len, const T* in, T* out) {
   blas->AXPY(data_len, T(1.f), in, out);
@@ -542,9 +544,9 @@ template struct MergeAdd;
 template struct MergeAdd<platform::CPUDeviceContext, int>;
 template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
 template struct MergeAdd<platform::CPUDeviceContext, float>;
 template struct MergeAdd<platform::CPUDeviceContext, double>;
 template struct MergeAdd<platform::CPUDeviceContext,
-                         paddle::platform::complex64>;
+                         paddle::platform::complex<float>>;
 template struct MergeAdd<platform::CPUDeviceContext,
-                         paddle::platform::complex128>;
+                         paddle::platform::complex<double>>;
 template struct MergeAdd<platform::CPUDeviceContext,
                          paddle::platform::bfloat16>;
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu
index 26e9a0de606babfc325de58ba73404191751411c..f3ef537a31b44c70000020f8d1a54c63ba156bc6 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -448,8 +448,9 @@ template struct MergeAdd;
 template struct MergeAdd<platform::CUDADeviceContext, double>;
 template struct MergeAdd<platform::CUDADeviceContext, int>;
 template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
 template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
-template struct MergeAdd<platform::CUDADeviceContext, platform::complex64>;
-template struct MergeAdd<platform::CUDADeviceContext, platform::complex128>;
+template struct MergeAdd<platform::CUDADeviceContext,
+                         platform::complex<float>>;
+template struct MergeAdd<platform::CUDADeviceContext,
+                         platform::complex<double>>;
 
 template <typename T>
 __global__ void UpdateToTensorKernel(const T* selected_rows,
diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc
index cc3b838cbcf1d7a8be016cef91afdd22ef6b1a28..5a8e7fcc2a76c29ce02f856be007ddfc13f3e09f 100644
--- a/paddle/fluid/operators/math/vol2col_test.cc
+++ b/paddle/fluid/operators/math/vol2col_test.cc
@@ -116,6 +116,9 @@ void testVol2col() {
   for (int i = 0; i < 12; ++i) {
     EXPECT_EQ(in_ptr[i], col_2_vol[i]);
   }
+
+  delete place;
+  delete context;
 }
 
 TEST(math, vol2col) {
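The test changes above free the manually new-ed place and context at the end of each case. An RAII variant would make the cleanup implicit even when an assertion returns early; a sketch with hypothetical stand-in types:

#include <memory>

// Sketch: RAII alternative to the explicit `delete place; delete context;`
// added above. PlaceLike/ContextLike are hypothetical stand-ins for the
// paddle types the tests allocate with new.
struct PlaceLike {};
struct ContextLike {
  explicit ContextLike(const PlaceLike&) {}
};

void test_body() {
  auto place = std::make_unique<PlaceLike>();
  auto context = std::make_unique<ContextLike>(*place);
  // ... exercise vol2col/col2vol with *context ...
}  // both freed here, even on an early return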
diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc
index 6fccd3657af77eced2d11e97b96c865f6ab92e43..82706fd48752307b14baaa4f6158ae8366f5b1dc 100644
--- a/paddle/fluid/operators/matmul_v2_op.cc
+++ b/paddle/fluid/operators/matmul_v2_op.cc
@@ -204,15 +204,15 @@ REGISTER_OP_CPU_KERNEL(
     matmul_v2, ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, float>,
     ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext, double>,
     ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex64>,
+                        paddle::platform::complex<float>>,
     ops::MatMulV2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex128>);
+                        paddle::platform::complex<double>>);
 
 REGISTER_OP_CPU_KERNEL(
     matmul_v2_grad,
     ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext, double>,
     ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::complex64>,
+                            paddle::platform::complex<float>>,
     ops::MatMulV2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::complex128>);
+                            paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu
index e819398ec9be9fec0dae9e35d1dbc414d0cc9cb3..2176ab79dd919dec17ca15c0297c87bf2a47e85e 100644
--- a/paddle/fluid/operators/matmul_v2_op.cu
+++ b/paddle/fluid/operators/matmul_v2_op.cu
@@ -21,12 +21,12 @@ REGISTER_OP_CUDA_KERNEL(
     matmul_v2, ops::MatMulV2Kernel<plat::CUDADeviceContext, float>,
     ops::MatMulV2Kernel<plat::CUDADeviceContext, double>,
     ops::MatMulV2Kernel<plat::CUDADeviceContext, plat::float16>,
-    ops::MatMulV2Kernel<plat::CUDADeviceContext, plat::complex64>,
-    ops::MatMulV2Kernel<plat::CUDADeviceContext, plat::complex128>);
+    ops::MatMulV2Kernel<plat::CUDADeviceContext, plat::complex<float>>,
+    ops::MatMulV2Kernel<plat::CUDADeviceContext, plat::complex<double>>);
 
 REGISTER_OP_CUDA_KERNEL(
     matmul_v2_grad, ops::MatMulV2GradKernel<plat::CUDADeviceContext, float>,
     ops::MatMulV2GradKernel<plat::CUDADeviceContext, double>,
     ops::MatMulV2GradKernel<plat::CUDADeviceContext, plat::float16>,
-    ops::MatMulV2GradKernel<plat::CUDADeviceContext, plat::complex64>,
-    ops::MatMulV2GradKernel<plat::CUDADeviceContext, plat::complex128>);
+    ops::MatMulV2GradKernel<plat::CUDADeviceContext, plat::complex<float>>,
+    ops::MatMulV2GradKernel<plat::CUDADeviceContext, plat::complex<double>>);
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
index ca20efaad074d76271e6c06992dcf0cc53a8739a..6061679b28893d23d08b32091409fe1f019c55f2 100644
--- a/paddle/fluid/operators/matmul_v2_op.h
+++ b/paddle/fluid/operators/matmul_v2_op.h
@@ -483,19 +483,19 @@ struct ConjHelper {
 };
 
 template <typename DeviceContext>
-struct ConjHelper<DeviceContext, paddle::platform::complex64> {
+struct ConjHelper<DeviceContext, paddle::platform::complex<float>> {
   explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {}
 
   HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) {
     dst.Resize(src.dims());
-    auto* src_data = src.data<paddle::platform::complex64>();
-    auto* dst_data = dst.mutable_data<paddle::platform::complex64>(
+    auto* src_data = src.data<paddle::platform::complex<float>>();
+    auto* dst_data = dst.mutable_data<paddle::platform::complex<float>>(
         ctx_.GetPlace(),
-        size_t(src.numel() * sizeof(paddle::platform::complex64)));
+        size_t(src.numel() * sizeof(paddle::platform::complex<float>)));
 
     platform::ForRange<DeviceContext> for_range(
         ctx_.template device_context<DeviceContext>(), src.numel());
-    math::ConjFunctor<paddle::platform::complex64> functor(
+    math::ConjFunctor<paddle::platform::complex<float>> functor(
         src_data, src.numel(), dst_data);
     for_range(functor);
     return;
@@ -504,19 +504,19 @@ struct ConjHelper {
 };
 
 template <typename DeviceContext>
-struct ConjHelper<DeviceContext, paddle::platform::complex128> {
+struct ConjHelper<DeviceContext, paddle::platform::complex<double>> {
   explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {}
 
   HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) {
     dst.Resize(src.dims());
-    auto* src_data = src.data<paddle::platform::complex128>();
-    auto* dst_data = dst.mutable_data<paddle::platform::complex128>(
+    auto* src_data = src.data<paddle::platform::complex<double>>();
+    auto* dst_data = dst.mutable_data<paddle::platform::complex<double>>(
         ctx_.GetPlace(),
-        size_t(src.numel() * sizeof(paddle::platform::complex128)));
+        size_t(src.numel() * sizeof(paddle::platform::complex<double>)));
 
     platform::ForRange<DeviceContext> for_range(
         ctx_.template device_context<DeviceContext>(), src.numel());
-    math::ConjFunctor<paddle::platform::complex128> functor(
+    math::ConjFunctor<paddle::platform::complex<double>> functor(
         src_data, src.numel(), dst_data);
     for_range(functor);
     return;
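ConjHelper exists because the complex matmul gradient needs conjugated operands: for C = A·B, dA = dC·conj(B)ᵀ and dB = conj(A)ᵀ·dC. The elementwise job its ConjFunctor performs, sketched on a plain std::complex buffer:

#include <complex>

// Sketch: the per-element conjugation ConjHelper delegates to ConjFunctor,
// applied to a raw buffer instead of a framework::Tensor.
template <typename T>
void conjugate(const std::complex<T>* src, std::complex<T>* dst, int n) {
  for (int i = 0; i < n; ++i) {
    dst[i] = std::conj(src[i]);
  }
}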
diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc
index d3022056a47ded99e63aa05c1aca8e9b31ccc3fe..f499c24ea3206cb4e9b570753bd0d7804942e6dd 100644
--- a/paddle/fluid/operators/matmul_v2_op_npu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_npu.cc
@@ -34,7 +34,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> {
     if (x->dims().size() == 2) {
       out->mutable_data<T>(ctx.GetPlace());
 
-      auto runner = NpuOpRunner(
+      const auto& runner = NpuOpRunner(
           "MatMul", {*x, *y}, {*out},
           {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}});
 
@@ -46,7 +46,7 @@ class MatMulV2NPUKernel : public framework::OpKernel<T> {
     } else if (x->dims().size() > 2) {
       out->mutable_data<T>(ctx.GetPlace());
 
-      auto runner =
+      const auto& runner =
           NpuOpRunner("BatchMatMul", {*x, *y}, {*out},
                       {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}});
 
@@ -76,7 +76,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
       if (transpose_y) {
         if (dx) {
           dx->mutable_data<T>(ctx.GetPlace());
-          auto runner_dx =
+          const auto& runner_dx =
               NpuOpRunner("MatMul", {*dout, *y}, {*dx},
                           {{"transpose_x1", false}, {"transpose_x2", false}});
 
@@ -84,7 +84,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
         }
         if (dy) {
           dy->mutable_data<T>(ctx.GetPlace());
-          auto runner_dy =
+          const auto& runner_dy =
               NpuOpRunner("MatMul", {*dout, *x}, {*dy},
                           {{"transpose_x1", true}, {"transpose_x2", false}});
 
@@ -94,7 +94,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
       } else {
         if (dx) {
           dx->mutable_data<T>(ctx.GetPlace());
-          auto runner_dx =
+          const auto& runner_dx =
               NpuOpRunner("MatMul", {*dout, *y}, {*dx},
                           {{"transpose_x1", false}, {"transpose_x2", true}});
 
@@ -102,7 +102,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
         }
         if (dy) {
           dy->mutable_data<T>(ctx.GetPlace());
-          auto runner_dy =
+          const auto& runner_dy =
              NpuOpRunner("MatMul", {*x, *dout}, {*dy},
                           {{"transpose_x1", true}, {"transpose_x2", false}});
 
@@ -113,30 +113,34 @@ class MatMulV2GradNPUKernel : public framework::OpKernel<T> {
       if (transpose_y) {
         if (dx) {
           dx->mutable_data<T>(ctx.GetPlace());
-          auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
-                                       {{"adj_x1", false}, {"adj_x2", false}});
+          const auto& runner_dx =
+              NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
+                          {{"adj_x1", false}, {"adj_x2", false}});
 
           runner_dx.Run(stream);
         }
         if (dy) {
           dy->mutable_data<T>(ctx.GetPlace());
-          auto runner_dy = NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy},
-                                       {{"adj_x1", true}, {"adj_x2", false}});
+          const auto& runner_dy =
+              NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy},
+                          {{"adj_x1", true}, {"adj_x2", false}});
 
           runner_dy.Run(stream);
         }
       } else {
         if (dx) {
           dx->mutable_data<T>(ctx.GetPlace());
-          auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
-                                       {{"adj_x1", false}, {"adj_x2", true}});
+          const auto& runner_dx =
+              NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx},
+                          {{"adj_x1", false}, {"adj_x2", true}});
 
           runner_dx.Run(stream);
         }
         if (dy) {
           dy->mutable_data<T>(ctx.GetPlace());
-          auto runner_dy = NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
-                                       {{"adj_x1", true}, {"adj_x2", false}});
+          const auto& runner_dy =
+              NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy},
+                          {{"adj_x1", true}, {"adj_x2", false}});
           runner_dy.Run(stream);
         }
       }
diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc
index d6e982039fa290ae9095fe380fa22955c6acde70..ab0a3336b361f8c7127019e424b2bf72c6b35385 100644
--- a/paddle/fluid/operators/mean_op_npu.cc
+++ b/paddle/fluid/operators/mean_op_npu.cc
@@ -30,7 +30,7 @@ class MeanNPUKernel : public framework::OpKernel<T> {
 
     out->mutable_data<T>(ctx.GetPlace());
 
-    auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
+    const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
@@ -61,7 +61,7 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
     // ones
     Tensor ones(grad->type());
     ones.mutable_data<T>(IG->dims(), context.GetPlace());
-    auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
+    const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
     runner_ones.Run(stream);
 
     // means
@@ -75,11 +75,12 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
     Tensor mean_ma(grad->type());
     mean_ma.Resize(IG->dims());
     mean_ma.mutable_data<T>(context.GetPlace());
-    auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
+    const auto& runner_mul_1 =
+        NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
     runner_mul_1.Run(stream);
 
     // and mul grad
-    auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
+    const auto& runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
     runner_mul_2.Run(stream);
   }
 };
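The repeated `auto runner = ...` → `const auto& runner = ...` change binds each NpuOpRunner temporary to a const reference, which extends the temporary's lifetime to the end of the scope and avoids materializing a copy. The idiom in isolation (RunnerSketch is a stand-in type):

#include <string>
#include <utility>

struct RunnerSketch {  // stand-in for NpuOpRunner
  explicit RunnerSketch(std::string op) : op_(std::move(op)) {}
  void Run() const {}
  std::string op_;
};

void example() {
  // The temporary's lifetime is extended to match `runner`; no copy is made.
  const auto& runner = RunnerSketch("MatMul");
  runner.Run();
}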
diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc
index c18b8590db18da7d21864ee7b4f32a831a6daa5f..f3cab995a08b611c64ec9e3abf9235da8a066eec 100644
--- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc
@@ -47,7 +47,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
     if (indices->type() != framework::proto::VarType::INT32) {
       cast_indices.Resize(indices->dims());
       cast_indices.mutable_data<int>(ctx.GetPlace());
-      auto runner_cast_indices =
+      const auto& runner_cast_indices =
           NpuOpRunner("Cast", {*indices}, {cast_indices},
                       {{"dst_type", static_cast<int>(dst_dtype)}});
       runner_cast_indices.Run(stream);
@@ -57,7 +57,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
     if (label->type() != framework::proto::VarType::INT32) {
       cast_label.Resize(label->dims());
       cast_label.mutable_data<int>(ctx.GetPlace());
-      auto runner_cast_label =
+      const auto& runner_cast_label =
          NpuOpRunner("Cast", {*label}, {cast_label},
                       {{"dst_type", static_cast<int>(dst_dtype)}});
       runner_cast_label.Run(stream);
@@ -73,7 +73,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
     Tensor tmp_equal(framework::proto::VarType::BOOL);
     tmp_equal.Resize(inference->dims());
     tmp_equal.mutable_data<bool>(ctx.GetPlace());
-    auto runner_equal =
+    const auto& runner_equal =
         NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {});
     runner_equal.Run(stream);
 
@@ -81,7 +81,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
     Tensor tmp_equal_cast(framework::proto::VarType::FP32);
     tmp_equal_cast.Resize(inference->dims());
     tmp_equal_cast.mutable_data<float>(ctx.GetPlace());
-    auto runner_cast_equal = NpuOpRunner(
+    const auto& runner_cast_equal = NpuOpRunner(
         "Cast", {tmp_equal}, {tmp_equal_cast},
         {{"dst_type",
           static_cast<int>(ConvertToNpuDtype(tmp_equal_cast.type()))}});
@@ -92,7 +92,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
     Tensor tmp_correct_max(framework::proto::VarType::FP32);
     tmp_correct_max.Resize(framework::make_ddim({num_samples}));
     tmp_correct_max.mutable_data<float>(ctx.GetPlace());
-    auto runner_reduce_max =
+    const auto& runner_reduce_max =
         NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max},
                     {{"axes", std::vector<int>{1}}, {"keep_dims", false}});
     runner_reduce_max.Run(stream);
@@ -101,14 +101,14 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
     Tensor tmp_correct(framework::proto::VarType::FP32);
     tmp_correct.Resize(correct->dims());
     tmp_correct.mutable_data<float>(ctx.GetPlace());
-    auto runner_reduce_sum =
+    const auto& runner_reduce_sum =
         NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct},
                     {{"axes", std::vector<int>{0}}, {"keep_dims", false}});
     runner_reduce_sum.Run(stream);
 
     // cast to int
     correct->mutable_data<int>(ctx.GetPlace());
-    auto runner_cast_correct = NpuOpRunner(
+    const auto& runner_cast_correct = NpuOpRunner(
         "Cast", {tmp_correct}, {*correct},
         {{"dst_type", static_cast<int>(ConvertToNpuDtype(correct->type()))}});
     runner_cast_correct.Run(stream);
@@ -126,7 +126,7 @@ class AccuracyNPUKernel : public framework::OpKernel<T> {
 
     // [accuracy]
     accuracy->mutable_data<float>(ctx.GetPlace());
-    auto runner_accuracy =
+    const auto& runner_accuracy =
         NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {});
     runner_accuracy.Run(stream);
   }
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 5b14d4f6872439325fab505d7e1972e39fe737e3..743a61c744be711ce2e05e16c6e456127e69fc3f 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -146,3 +146,6 @@ REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker,
                   ops::MinusGradDescMaker, ops::MinusGradMaker);
 REGISTER_OP_CPU_KERNEL(
     minus, ops::MinusKernel<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OP_CUDA_KERNEL(
+    minus, ops::MinusKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/minus_op.cu b/paddle/fluid/operators/minus_op.cu
deleted file mode 100644
index 956d935da9b96696e9148fc4dfab23a6a6c29016..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/minus_op.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/minus_op.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    minus,
-    paddle::operators::MinusKernel<paddle::platform::CUDADeviceContext,
-                                   float>);
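With minus_op.cu deleted, both registrations live in minus_op.cc; this is possible because MinusKernel is a header-only template in minus_op.h, so no device-specific translation unit is needed. Schematically (a comment sketch, not the macros' actual expansion):

// Sketch: after this patch, minus_op.cc carries both registrations.
// REGISTER_OP_CPU_KERNEL(minus,  MinusKernel<CPUDeviceContext,  float>);
// REGISTER_OP_CUDA_KERNEL(minus, MinusKernel<CUDADeviceContext, float>);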
diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h
index 7791b1456a81516e48db645501c717d9c4cf8749..2300506c623ee2c5cbbeb502e80cf10838182a2a 100644
--- a/paddle/fluid/operators/minus_op.h
+++ b/paddle/fluid/operators/minus_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace paddle {
 namespace operators {
@@ -30,9 +31,10 @@ class MinusKernel : public framework::OpKernel<T> {
     out_tensor->mutable_data<T>(context.GetPlace());
     auto& dev =
         *context.template device_context<DeviceContext>().eigen_device();
-    framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
-        framework::EigenVector<T>::Flatten(*left_tensor) -
-        framework::EigenVector<T>::Flatten(*right_tensor);
+    EigenSub<std::decay_t<decltype(dev)>, T>::Eval(
+        dev, framework::EigenVector<T>::Flatten(*out_tensor),
+        framework::EigenVector<T>::Flatten(*left_tensor),
+        framework::EigenVector<T>::Flatten(*right_tensor));
   }
 };
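EigenSub hides the raw Eigen subtraction expression behind a device-templated Eval, so the same kernel source serves every device. A reduced, single-device sketch of such a wrapper (assuming Eigen is available; EigenSubSketch is hypothetical):

#include <Eigen/Dense>

// Sketch: a wrapper in the spirit of EigenSub; the real one is specialized
// per Eigen device type, only the default device is shown here.
struct EigenSubSketch {
  template <typename Out, typename Lhs, typename Rhs>
  static void Eval(Out&& out, const Lhs& lhs, const Rhs& rhs) {
    out = lhs - rhs;  // Eigen evaluates the expression on assignment
  }
};

int main() {
  Eigen::VectorXf a = Eigen::VectorXf::Constant(4, 3.0f);
  Eigen::VectorXf b = Eigen::VectorXf::Constant(4, 1.0f);
  Eigen::VectorXf out(4);
  EigenSubSketch::Eval(out, a, b);  // out = a - b
  return 0;
}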
%d, but got %d.", + DataLayout::kMKLDNN, filter->layout())); + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Got wrong format for Filter tensor.")); + + PADDLE_ENFORCE_EQ( + out_grad->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The output_grad tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, out_grad->layout())); + PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for output_grad tensor")); + + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); + + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + std::vector dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), + end(dilations_temp)); + + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + + int groups = ctx.Attr("groups"); + + auto input_dims = in->dims(); + auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); + auto filter_dims = filter->dims(); + auto filter_data_dims = + framework::slice_ddim(filter_dims, 2, filter_dims.size()); + + auto ksize = framework::vectorize(filter_data_dims); + + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + data_dims, strides, ksize); + + auto src_tz = framework::vectorize(in->dims()); + auto weights_tz = framework::vectorize(filter->dims()); + + int g = std::max(groups, 1); + platform::GetGroupConvWeightsTz(weights_tz, g); + auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + const auto chosen_memory_format = MKLDNNMemoryFormat::any; + const auto weights_format = MKLDNNMemoryFormat::any; + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + const auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), weights_format); + auto diff_weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), weights_format); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + + auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); + std::transform(dilations.begin(), dilations.end(), dilations.begin(), + [](int64_t i) { return i - 1; }); + const mkldnn::memory::dims dilations_dims = dilations; + + const mkldnn::memory::dims stride_dims = strides; + // Recreating FWD PD. 
For training there are no post ops in convolution + mkldnn::primitive_attr conv_attr; + if (bias) { + auto bias_tz = framework::vectorize(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + + this->AcquireForwardPrimitiveDescriptorNonBlocking( + conv_attr, mkldnn::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, src_md, weights_md, bias_md, + dst_md, stride_dims, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptorNonBlocking( + conv_attr, mkldnn::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, src_md, weights_md, dst_md, + stride_dims, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } + + this->AcquireBackwardPrimitiveDescriptorNonBlocking( + mkldnn::algorithm::convolution_direct, diff_src_md, weights_md, + diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + + this->AcquireBackwardWeightsPrimitiveDescriptorNonBlocking( + mkldnn::algorithm::convolution_direct, src_md, diff_weights_md, + diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -280,27 +420,75 @@ class ConvMKLDNNHandlerT return conv_attr; } + std::shared_ptr + AcquireWeightsMemoryWithReorderFromDataPrimitive( + const framework::Tensor* filter, const int groups, const bool is_conv3d) { + const K* filter_data = filter->data(); + auto weights_tz = framework::vectorize(filter->dims()); + platform::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), + GetWeightsFormat(filter->format(), groups, is_conv3d)); + + return this->AcquireMemoryWithReorder( + user_src_md, this->bwd_pd_->weights_desc(), + to_void_cast(filter_data), "@weights_mem_d_p", false); + } + std::shared_ptr AcquireSrcMemoryWithReorder( const framework::Tensor* input) { - const T* input_data = input->data(); - const std::string user_key_suffix{"@src_mem_p_user"}; - auto user_src_mem_p = this->AcquireMemory(user_key_suffix); + return this->AcquireMemoryWithReorderPrimitive( + input, "@src_mem_p_user", "@src_mem_p_target", "@src_mem_p", + this->fwd_pd_->src_desc()); + } - if (!user_src_mem_p) { - auto user_src_md = platform::MKLDNNMemDesc( - framework::vectorize(input->dims()), platform::MKLDNNGetDataType(), - input->format()); + std::shared_ptr + AcquireSrcMemoryWithReorderFromWeightsPrimitive( + const framework::Tensor* input) { + return this->AcquireMemoryWithReorderPrimitive( + input, "@src_mem_w_p_user", "@src_mem_w_p_target", "@src_mem_w_p", + this->bwd_w_pd_->src_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + const framework::Tensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, "@diff_dst_mem_w_p_user", "@diff_dst_mem_w_p_target", + "@diff_dst_mem_w_p", this->bwd_w_pd_->diff_dst_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + const framework::Tensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, "@diff_dst_mem_p_user", "@diff_dst_mem_p_target", + "@diff_dst_mem_p", this->bwd_pd_->diff_dst_desc()); + } + + std::shared_ptr AcquireMemoryWithReorderPrimitive( + const framework::Tensor* in_mem, const char* key_mem_user, + const 
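The new backward constructor mirrors the forward one: it recreates the forward primitive descriptor (used as a hint by the two backward descriptors) and performs the heavy setup only when isBwdCached() misses. The acquire-or-build contract behind such checks, reduced to a sketch with assumed key semantics:

#include <map>
#include <memory>
#include <string>

// Sketch: the caching pattern behind isCached()/isBwdCached(); the real
// handler keys on device context blobs built via platform::CreateKey.
struct PrimDescSketch {};

std::shared_ptr<PrimDescSketch> AcquireOrBuild(
    std::map<std::string, std::shared_ptr<PrimDescSketch>>& cache,
    const std::string& key) {
  auto it = cache.find(key);
  if (it != cache.end()) return it->second;      // cached: skip setup
  auto pd = std::make_shared<PrimDescSketch>();  // miss: build once
  cache.emplace(key, pd);
  return pd;
}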
   mkldnn::primitive_attr CreatePostOps(
       std::string fuse_activation, float fuse_alpha, float fuse_beta,
       bool fuse_residual_conn,
       const std::vector<float> output_shift_scale = {},
@@ -280,27 +420,75 @@ class ConvMKLDNNHandlerT
     return conv_attr;
   }
 
+  std::shared_ptr<mkldnn::memory>
+  AcquireWeightsMemoryWithReorderFromDataPrimitive(
+      const framework::Tensor* filter, const int groups, const bool is_conv3d) {
+    const K* filter_data = filter->data<K>();
+    auto weights_tz = framework::vectorize(filter->dims());
+    platform::GetGroupConvWeightsTz(weights_tz, groups);
+
+    auto user_src_md = platform::MKLDNNMemDesc(
+        weights_tz, platform::MKLDNNGetDataType<K>(),
+        GetWeightsFormat(filter->format(), groups, is_conv3d));
+
+    return this->AcquireMemoryWithReorder(
+        user_src_md, this->bwd_pd_->weights_desc(),
+        to_void_cast<K>(filter_data), "@weights_mem_d_p", false);
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireSrcMemoryWithReorder(
       const framework::Tensor* input) {
-    const T* input_data = input->data<T>();
-    const std::string user_key_suffix{"@src_mem_p_user"};
-    auto user_src_mem_p = this->AcquireMemory(user_key_suffix);
+    return this->AcquireMemoryWithReorderPrimitive(
+        input, "@src_mem_p_user", "@src_mem_p_target", "@src_mem_p",
+        this->fwd_pd_->src_desc());
+  }
 
-    if (!user_src_mem_p) {
-      auto user_src_md = platform::MKLDNNMemDesc(
-          framework::vectorize(input->dims()), platform::MKLDNNGetDataType<T>(),
-          input->format());
+  std::shared_ptr<mkldnn::memory>
+  AcquireSrcMemoryWithReorderFromWeightsPrimitive(
+      const framework::Tensor* input) {
+    return this->AcquireMemoryWithReorderPrimitive(
+        input, "@src_mem_w_p_user", "@src_mem_w_p_target", "@src_mem_w_p",
+        this->bwd_w_pd_->src_desc());
+  }
+
+  std::shared_ptr<mkldnn::memory>
+  AcquireDiffDstMemoryWithReorderFromWeightsPrimitive(
+      const framework::Tensor* out_grad) {
+    return this->AcquireMemoryWithReorderPrimitive(
+        out_grad, "@diff_dst_mem_w_p_user", "@diff_dst_mem_w_p_target",
+        "@diff_dst_mem_w_p", this->bwd_w_pd_->diff_dst_desc());
+  }
+
+  std::shared_ptr<mkldnn::memory>
+  AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive(
+      const framework::Tensor* out_grad) {
+    return this->AcquireMemoryWithReorderPrimitive(
+        out_grad, "@diff_dst_mem_p_user", "@diff_dst_mem_p_target",
+        "@diff_dst_mem_p", this->bwd_pd_->diff_dst_desc());
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorderPrimitive(
+      const framework::Tensor* in_mem, const char* key_mem_user,
+      const char* key_mem_target, const char* key_mem,
+      const mkldnn::memory::desc& mem_md) {
+    const T* in_mem_data = in_mem->data<T>();
+    const std::string user_key_suffix{key_mem_user};
+    auto user_mem_p = this->AcquireMemory(user_key_suffix);
+
+    if (!user_mem_p) {
+      auto user_mem_md = platform::MKLDNNMemDesc(
+          framework::vectorize(in_mem->dims()),
+          platform::MKLDNNGetDataType<T>(), in_mem->format());
       return this->AcquireMemoryWithReorder(
-          user_src_md, this->fwd_pd_->src_desc(), to_void_cast<T>(input_data),
-          "@src_mem_p");
+          user_mem_md, mem_md, to_void_cast<T>(in_mem_data), key_mem);
     } else {
-      const std::string target_key_suffix{"@src_mem_p_target"};
-      const auto target_src_mem_p = this->AcquireMemory(target_key_suffix);
-      user_src_mem_p->set_data_handle(to_void_cast<T>(input_data));
-      if (user_src_mem_p != target_src_mem_p) {
-        this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p");
+      const std::string target_key_suffix{key_mem_target};
+      const auto target_mem_p = this->AcquireMemory(target_key_suffix);
+      user_mem_p->set_data_handle(to_void_cast<T>(in_mem_data));
+      if (user_mem_p != target_mem_p) {
+        this->AcquireReorder(user_mem_p, target_mem_p, key_mem);
       }
-      return target_src_mem_p;
+      return target_mem_p;
     }
   }
 
@@ -866,7 +1054,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel {
   }
 };
 
-template <typename T>
+template <typename T, typename K>
 class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
  public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
@@ -879,189 +1067,44 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel {
 
     const Tensor* input = ctx.Input<Tensor>("Input");
     const Tensor* filter = ctx.Input<Tensor>("Filter");
+    const Tensor* bias =
+        ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     const Tensor* output_grad =
         ctx.Input<Tensor>(framework::GradVarName("Output"));
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
-    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
-                      platform::errors::InvalidArgument(
-                          "The input tensor's layout should be %d, but got %d.",
-                          DataLayout::kMKLDNN, input->layout()));
-    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef,
-                      platform::errors::InvalidArgument(
-                          "Got wrong format for Input tensor."));
-
-    PADDLE_ENFORCE_EQ(
-        filter->layout(), DataLayout::kMKLDNN,
-        platform::errors::InvalidArgument(
-            "The filter tensor's layout should be %d, but got %d.",
-            DataLayout::kMKLDNN, filter->layout()));
-    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef,
-                      platform::errors::InvalidArgument(
-                          "Got wrong format for Filter tensor."));
-
-    PADDLE_ENFORCE_EQ(
-        output_grad->layout(), DataLayout::kMKLDNN,
-        platform::errors::InvalidArgument(
-            "The output_grad tensor's layout should be %d, but got %d.",
-            DataLayout::kMKLDNN, output_grad->layout()));
-    PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef,
-                      platform::errors::InvalidArgument(
-                          "Wrong format set for output_grad tensor"));
-
-    PADDLE_ENFORCE_EQ(
-        ctx.Attr<bool>("is_test"), false,
-        platform::errors::InvalidArgument(
-            "is_test attribute should be set to False in training phase."));
-
     if (!input_grad && !filter_grad) return;
 
-    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-    std::vector<int64_t> strides(begin(strides_temp), end(strides_temp));
-
-    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-    std::vector<int64_t> paddings(begin(paddings_temp), end(paddings_temp));
-
-    std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
-    std::vector<int64_t> dilations(begin(dilations_temp), end(dilations_temp));
-
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
-
-    int groups = ctx.Attr<int>("groups");
-
-    bool is_conv3d = strides.size() == 3U;
-    const T* input_data = input->data<T>();
-    const T* filter_data = filter->data<T>();
-    const T* output_grad_data = output_grad->data<T>();
-    T* input_grad_data = nullptr;
-    T* filter_grad_data = nullptr;
-
-    auto input_dims = input->dims();
-    auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size());
-    auto filter_dims = filter->dims();
-    auto filter_data_dims =
-        framework::slice_ddim(filter_dims, 2, filter_dims.size());
-
-    auto ksize = framework::vectorize(filter_data_dims);
-
-    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
-                             data_dims, strides, ksize);
-
-    auto src_tz = paddle::framework::vectorize(input->dims());
-    auto weights_tz = paddle::framework::vectorize(filter->dims());
-
-    int g = std::max(groups, 1);
-    platform::GetGroupConvWeightsTz(weights_tz, g);
-    auto dst_tz = paddle::framework::vectorize(output_grad->dims());
-
-    auto src_format = input->format();
-    MKLDNNMemoryFormat weights_format =
-        GetWeightsFormat(filter->format(), g, is_conv3d);
-
-    // Get an unique name from "argument" name of "input" and "Filter" variable
-    // as well as attributes of primitive to be created
-    // This name will be used as key when saving info into device context
-    std::string key = platform::CreateKey(
-        dev_ctx, src_tz, ctx.InputName("Input") + ctx.InputName("Filter"));
-
-    const std::string key_conv_pd = key + "@fwd_pd";
-    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
-    std::vector<mkldnn::primitive> pipeline;
-
-    // Create user memory descriptors
-    auto user_src_md = platform::MKLDNNMemDesc(
-        {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
-    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
-    auto user_diff_dst_md = platform::MKLDNNMemDesc(
-        {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
-
-    /* create memory descriptor for conv backward without specified format
-     * ('any') which lets a primitive (conv backward in this case) choose
-     * the memory format preferred for best performance
-     */
-    auto chosen_memory_format = MKLDNNMemoryFormat::any;
-    weights_format = MKLDNNMemoryFormat::any;
-
-    auto src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto diff_src_md = platform::MKLDNNMemDesc(
-        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    auto diff_weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    auto diff_dst_md = platform::MKLDNNMemDesc(
-        dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
-    // Retrieve conv_pd from device context
-    auto conv_pd =
-        std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
-            dev_ctx.GetBlob(key_conv_pd));
-    PADDLE_ENFORCE_NE(conv_pd, nullptr,
-                      platform::errors::InvalidArgument(
-                          "Fail to find conv_pd in device context"));
-
-    auto mkldnn_paddings = platform::ToMkldnnPadding(paddings);
-    std::transform(dilations.begin(), dilations.end(), dilations.begin(),
-                   [](int64_t i) { return i - 1; });
-    const mkldnn::memory::dims dilations_dims = dilations;
-    // create backward convolution weights primitive descriptor
-    auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
-        mkldnn::algorithm::convolution_direct, src_md, diff_weights_md,
-        diff_dst_md, strides, dilations_dims, mkldnn_paddings[0],
-        mkldnn_paddings[1]);
-
-    auto conv_bwd_weights_pd =
-        std::make_shared<mkldnn::convolution_backward_weights::primitive_desc>(
-            conv_bwd_weights_desc, mkldnn_engine, *conv_pd);
-
-    // create backward convolution data primitive descriptor
-    auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
-        mkldnn::algorithm::convolution_direct, diff_src_md, weights_md,
-        diff_dst_md, strides, dilations_dims, mkldnn_paddings[0],
-        mkldnn_paddings[1]);
-
-    auto conv_bwd_data_pd =
-        std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
-            conv_bwd_data_desc, mkldnn_engine, *conv_pd);
-
-    platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd,
-                                        conv_bwd_weights_pd, dev_ctx,
-                                        mkldnn_engine, key);
+    // TODO(jczaja): Are all tensors really needed?
+    ConvMKLDNNHandlerT<T, K, T> handler(
+        ctx, dev_ctx, ctx.GetPlace(), input, filter, bias, output_grad,
+        filter_grad, input_grad,
+        ctx.InputName("Input") + ctx.InputName("Filter"));
 
     // create mkldnn memory from input tensors (data/weights)
-    auto user_src_memory_p =
-        handler.AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
-    auto user_weights_memory_p = handler.AcquireWeightsMemory(
-        user_weights_md, to_void_cast<T>(filter_data));
-    auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory(
-        user_diff_dst_md, to_void_cast<T>(output_grad_data));
     auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
 
-    if (filter_grad) {
-      auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive(
-          user_src_memory_p, pipeline);
-
-      auto diff_dst_memory_4filter_p =
-          handler.AcquireDiffDstMemoryFromWeightsPrimitive(
-              user_diff_dst_memory_p, pipeline);
-
-      const size_t size = handler.GetDiffWeightsMemorySize();
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
+    if (filter_grad) {
+      auto src_memory_p =
+          handler.AcquireSrcMemoryWithReorderFromWeightsPrimitive(input);
+      auto diff_dst_memory_p =
+          handler.AcquireDiffDstMemoryWithReorderFromWeightsPrimitive(
+              output_grad);
 
       // For convoluition with groups write filter grad into
       // oneDNN buffer and then we reorder it into filter_grad tensor
+      int g = std::max(ctx.Attr<int>("groups"), 1);
       auto diff_weights_memory_p =
-          g > 1 ? handler.AcquireDiffWeightsMemoryFromWeightsPrimitive()
-                : handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
-                      reinterpret_cast<T*>(filter_grad_data));
+          g > 1 ? handler.AcquireDiffWeightsMemory()
+                : handler.AcquireDiffWeightsMemory(filter_grad);
 
-      auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights();
+      auto conv_bwd_weights_p = handler.AcquireBackwardWeightsPrimitive();
 
       // TODO(grygielski) why no bias_diff?
       conv_bwd_weights_p->execute(
           astream, {{MKLDNN_ARG_SRC, *src_memory_p},
-                    {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_4filter_p},
+                    {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p},
                     {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}});
       astream.wait();
 
@@ -1073,10 +1116,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel {
       // For convolution with groups convert from blocked to NCHW
       // otherwise there will be problems in next operators working on this data
      if (g > 1) {
-        memory::data_type in_type =
-            framework::ToMKLDNNDataType(filter_grad->type());
+        memory::data_type in_type = framework::ToMKLDNNDataType(filter->type());
         // for 3d conv with groups (six dimensional data reorder to goidhw)
         // for 2d conv with groups (five dimensional data reorder to goihw)
+        // auto weights_tz = paddle::framework::vectorize(filter->dims());
+
+        auto weights_tz = diff_weights_memory_p->get_desc().dims();
         mkldnn::memory::format_tag out_format =
             weights_tz.size() == 6 ? mkldnn::memory::format_tag::goidhw
                                    : mkldnn::memory::format_tag::goihw;
@@ -1084,9 +1129,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel {
                                               out_format, in_type);
         key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
 
-        platform::ReorderMKLDNNHandler handler(weights_tz, filter_grad->type(),
-                                               in_type, dev_ctx, mkldnn_engine,
-                                               key);
+        platform::ReorderMKLDNNHandler handler(
+            weights_tz, filter->type(), in_type, dev_ctx, mkldnn_engine, key);
         auto reorder_dst_memory_p =
             handler.AcquireDstMemory(filter_grad, out_format, ctx.GetPlace());
 
@@ -1113,24 +1157,21 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel {
       }
     }
     if (input_grad) {
-      auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive(
-          user_weights_memory_p, pipeline);
-
-      auto diff_dst_memory_4data_p =
-          handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p,
-                                                        pipeline);
-
-      const size_t size = handler.GetDiffSourceMemorySize();
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
+      auto weights_memory_p =
+          handler.AcquireWeightsMemoryWithReorderFromDataPrimitive(
+              filter, ctx.Attr<int>("groups"),
+              ctx.Attr<std::vector<int>>("strides").size() == 3U);
 
-      auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
-          reinterpret_cast<T*>(input_grad_data));
+      auto diff_dst_memory_p =
+          handler.AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive(
+              output_grad);
+      auto diff_src_memory_p = handler.AcquireDiffSrcMemory(input_grad);
 
-      auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData();
+      auto conv_bwd_data_p = handler.AcquireBackwardPrimitive();
 
       conv_bwd_data_p->execute(astream,
                                {{MKLDNN_ARG_WEIGHTS, *weights_memory_p},
-                                {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_4data_p},
+                                {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p},
                                 {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}});
       astream.wait();
 
@@ -1167,7 +1208,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
                                     ::paddle::platform::CPUPlace, FP32,
                                     ops::kConvMKLDNNFP32,
-                                    ops::ConvMKLDNNGradOpKernel<float>);
+                                    ops::ConvMKLDNNGradOpKernel<float, float>);
 
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
                                     ::paddle::platform::CPUPlace, FP32,
@@ -1177,4 +1218,4 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
 REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
                                     ::paddle::platform::CPUPlace, FP32,
                                     ops::kConvMKLDNNFP32,
-                                    ops::ConvMKLDNNGradOpKernel<float>);
+                                    ops::ConvMKLDNNGradOpKernel<float, float>);
diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc
index e0736239d40f289a11a1e1fd8380fcbad904a667..cfa75bc1ce1c4ce7fd7ada3a139c25ee2d840044 100644
--- a/paddle/fluid/operators/mul_op_npu.cc
+++ b/paddle/fluid/operators/mul_op_npu.cc
@@ -36,7 +36,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
     if (x_num_col_dims == 1 && y_num_col_dims == 1) {
       if (x->dims().size() == 2 && y->dims().size() == 2) {
         out->mutable_data<T>(ctx.GetPlace());
-        auto runner =
+        const auto& runner =
             NpuOpRunner("MatMul", {*x, *y}, {*out},
                         {{"transpose_x1", false}, {"transpose_x2", false}});
 
@@ -54,7 +54,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
         tmp_x.Resize(framework::make_ddim({first_dim, sec_dim}));
         out->mutable_data<T>(ctx.GetPlace());
         // matmul
-        auto runner =
+        const auto& runner =
             NpuOpRunner("MatMul", {tmp_x, *y}, {*out},
                         {{"transpose_x1", false}, {"transpose_x2", false}});
         runner.Run(stream);
@@ -85,7 +85,7 @@ class MulNPUKernel : public framework::OpKernel<T> {
         tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]}));
         tmp_matmul.mutable_data<T>(ctx.GetPlace());
-
auto runner_matmul = + const auto& runner_matmul = NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul}, {{"transpose_x1", false}, {"transpose_x2", false}}); @@ -121,7 +121,7 @@ class MulGradNPUKernel : public framework::OpKernel { if (x->dims().size() == 2 && y->dims().size() == 2) { if (dx) { dx->mutable_data(ctx.GetPlace()); - auto runner_dx = + const auto& runner_dx = NpuOpRunner("MatMul", {*dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); @@ -130,7 +130,7 @@ class MulGradNPUKernel : public framework::OpKernel { if (dy) { dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {*x, *dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); @@ -144,7 +144,7 @@ class MulGradNPUKernel : public framework::OpKernel { dx->mutable_data(ctx.GetPlace()); auto dx_dims = dx->dims(); dx->Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]})); - auto runner_matmul = + const auto& runner_matmul = NpuOpRunner("MatMul", {*dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); runner_matmul.Run(stream); @@ -164,7 +164,7 @@ class MulGradNPUKernel : public framework::OpKernel { ctx.template device_context(), &tmp_x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); @@ -193,7 +193,7 @@ class MulGradNPUKernel : public framework::OpKernel { dx->mutable_data(ctx.GetPlace()); auto dx_dims = dx->dims(); dx->Resize(framework::make_ddim({dout_first_dim, y->dims()[0]})); - auto runner_matmul = + const auto& runner_matmul = NpuOpRunner("MatMul", {tmp_dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); runner_matmul.Run(stream); @@ -213,7 +213,7 @@ class MulGradNPUKernel : public framework::OpKernel { tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); // matmul: [6,4] x [6,5] => [4,5] dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); runner_dy.Run(stream); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index 276bfa7b3281b9886c6561187c48aec4e9e847c5..a6ea656cfcddb393364c295a25598b5b3a0cf96e 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -89,7 +89,21 @@ NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector &inputs, } NpuOpRunner::~NpuOpRunner() { - // TODO(zhiqiu): handle free + VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_; + // Is it safe to free the descs/buffers after Run() is called on the host?
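Note on the destructor hunk that continues below: it releases every ACL attribute, tensor descriptor, and data buffer the runner created, replacing the old `// TODO(zhiqiu): handle free`. The in-code question (is the free safe while an asynchronously launched Run() may still be in flight?) deserves spelling out; the snippet here is a hypothetical, simplified C++ illustration of that hazard, not Paddle or ACL code:

#include <future>
#include <vector>

struct Desc { int meta = 0; };  // stand-in for a device descriptor

class RunnerSketch {
 public:
  ~RunnerSketch() {
    // The destructor body runs before any member destructor, so pending_
    // has not joined yet: the async task may still be reading descs_ when
    // these deletes run. Synchronizing the stream (or joining) first would
    // remove the race.
    for (Desc* d : descs_) delete d;  // potential use-after-free
  }
  void RunAsync() {
    pending_ = std::async(std::launch::async, [this] {
      for (Desc* d : descs_) (void)d->meta;  // models a device-side read
    });
  }

 private:
  std::vector<Desc*> descs_{new Desc, new Desc};
  std::future<void> pending_;
};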
+ aclopDestroyAttr(attr_); // return void + for (auto desc : input_descs_) { + aclDestroyTensorDesc(desc); + } + for (auto desc : output_descs_) { + aclDestroyTensorDesc(desc); + } + for (auto buffer : input_buffers_) { + PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer)); + } + for (auto buffer : output_buffers_) { + PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer)); + } } const std::string &NpuOpRunner::Type() { return op_type_; } @@ -186,6 +200,8 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { } NpuOpRunner &NpuOpRunner::AddInputs(const std::vector &tensors) { + input_descs_.reserve(tensors.size()); + input_buffers_.reserve(tensors.size()); for (auto tensor : tensors) { // create aclTensorDesc input_descs_.emplace_back(CreateTensorDesc(tensor)); @@ -211,6 +227,8 @@ NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector &names) { } NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector &tensors) { + output_descs_.reserve(tensors.size()); + output_buffers_.reserve(tensors.size()); for (auto tensor : tensors) { // create aclTensorDesc output_descs_.emplace_back(CreateTensorDesc(tensor)); @@ -281,12 +299,12 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { return buffer; } -void NpuOpRunner::Run(aclrtStream stream) { +void NpuOpRunner::Run(aclrtStream stream) const { if (!stream) { VLOG(4) << "Run with default current npu stream: " << stream; stream = GetCurrentNPUStream(); } - + VLOG(5) << "NpuOpRunner(" << this << ") Run:"; VLOG(4) << "op_type: " << op_type_; VLOG(4) << "input_desc.size: " << input_descs_.size(); VLOG(4) << "output_desc.size: " << output_descs_.size(); diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index 79d77235b7c81b75d00336d7198e836c18eb3347..a637935c749629aea396ee114e89218612156510 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -41,6 +41,14 @@ class NpuOpRunner { const std::vector &outputs = {}, const NPUAttributeMap &attrs = {}); + // NOTE(zhiqiu): why forbid copy and operator=? + // Since we will free the tensor_descs and data_buffers in ~NpuOpRunner, + // if a shallow copy is performed on tensor_descs and data_buffers, it may + // result in use-after-free bugs.
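Note: the deleted copy constructor and copy assignment that follow are the classic rule-of-three response to the destructor above. A minimal, self-contained sketch of the same idiom (hypothetical class, not the Paddle one; the move constructor is shown only to note what would still be safe and is not part of the diff):

#include <cstddef>

class OwnerSketch {
 public:
  explicit OwnerSketch(std::size_t n) : data_(new char[n]) {}
  ~OwnerSketch() { delete[] data_; }  // frees a raw handle, as NpuOpRunner does

  // A shallow copy would leave two objects freeing the same handle
  // (double free / use-after-free), so copying is forbidden.
  OwnerSketch(const OwnerSketch&) = delete;
  OwnerSketch& operator=(const OwnerSketch&) = delete;

  // Moving transfers ownership instead, so nothing is freed twice.
  OwnerSketch(OwnerSketch&& o) noexcept : data_(o.data_) { o.data_ = nullptr; }

 private:
  char* data_;
};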
+ NpuOpRunner(const NpuOpRunner &runner) = delete; + NpuOpRunner &operator=(const NpuOpRunner &runner) = delete; + ~NpuOpRunner(); const std::string &Type(); @@ -71,7 +79,7 @@ class NpuOpRunner { std::vector &GetOutputBuffers(); - void Run(aclrtStream stream = nullptr); + void Run(aclrtStream stream = nullptr) const; private: aclTensorDesc *CreateTensorDesc(Tensor tensor); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index e5fe7f20a42e0b869bdefe34a683b640c0a108f4..70fd546e5042c3ae96ec333c251e72396fef0e59 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -147,7 +147,7 @@ class AdamNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = + const auto& runner = NpuOpRunner("ApplyAdamD", { *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, @@ -179,10 +179,10 @@ class AdamNPUKernel : public framework::OpKernel { if (!use_global_beta_pow) { beta1_pow_out->mutable_data(ctx.GetPlace()); beta2_pow_out->mutable_data(ctx.GetPlace()); - auto runner_m1 = + const auto& runner_m1 = NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); runner_m1.Run(stream); - auto runner_m2 = + const auto& runner_m2 = NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); runner_m2.Run(stream); } diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc old mode 100755 new mode 100644 index 479f9643749d63c673158ad055409a0925f3d576..8f30dd5b2e68a4d15d849141b175b8eae503b170 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("LearningRate", "(LoDTensor, default LoDTensor) " "Input learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(LoDTensor) This output is updated parameter. " @@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("VelocityOut", "(LoDTensor) This output is updated velocity. " "It shares memory with Input(Velocity)."); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shares memory with Input(MasterParam).") + .AsDispensable(); AddAttr("mu", "(float) Momentum coefficient"); AddAttr("lars_coeff", "(float, default 0.001) LARS coefficient.") @@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("epsilon", "(float, default 0.0) epsilon to avoid division by zero.") .SetDefault(0.0); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddAttr( + "rescale_grad", + "(float, default 1.0) Multiply the gradient by `rescale_grad` " + "before updating. Often chosen to be `1.0/batch_size`.") + .SetDefault(1.0f); AddComment(R"DOC( Lars Momentum Optimizer. diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index eb0111ae4de2f066359e26406f6c7ec3eb54d5fc..42477232e7ca1b23c53d88eecaa7e13c4197ecbd 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -13,36 +13,64 @@ See the License for the specific language governing permissions and limitations under the License.
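Note: for readers of the rewritten lars_momentum_op.cu that follows, the update the kernel implements, reconstructed from the added code (||.|| is the L2 norm; the gradient norm is taken after rescaling), is:

local_lr = lr * lars_coeff * ||p|| / (||g|| + lars_weight_decay * ||p|| + epsilon)
           (when lars_weight_decay, ||p||, and ||g|| are all positive; otherwise local_lr = lr)
v_new    = mu * v + local_lr * (rescale_grad * g + lars_weight_decay * p)
p_new    = p - v_new

With multi_precision on, p is read from and written to the FP32 master copy (master_p / master_p_out) while the FP16 parameter output is just a cast of p_new.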
*/ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" namespace paddle { namespace operators { template -__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, - const T* learning_rate, const T mu, - const int64_t num, const T lars_coeff, - const T lars_weight_decay, const T* p_norm, - const T* g_norm, T* p_out, T* v_out, - const T epsilon) { - T lr = learning_rate[0]; - T local_lr = learning_rate[0]; +using MultiPrecisionType = typename details::MPTypeTrait::Type; + +template +__global__ void MomentumLarsKernel( + const T* p, const T* g, const MT* v, + const MultiPrecisionType* learning_rate, const MT mu, const int64_t num, + const MT lars_coeff, const MT lars_weight_decay, + const MultiPrecisionType* p_norm, const MultiPrecisionType* g_norm, + T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out, + const MultiPrecisionType rescale_grad) { + const MT lr = static_cast(learning_rate[0]); + MT local_lr = lr; + const MT p_n = static_cast(p_norm[0]); + const MT g_n = static_cast(g_norm[0]); + + if (lars_weight_decay > static_cast(0) && p_n > static_cast(0) && + g_n > static_cast(0)) { + local_lr = + lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon); + } CUDA_KERNEL_LOOP(i, num) { - if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) { - local_lr = lr * lars_coeff * p_norm[0] / - (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon); - } + MT grad = static_cast(g[i]) * static_cast(rescale_grad); + MT param = master_p ? master_p[i] : static_cast(p[i]); + + MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param); + MT p_new = param - v_new; - T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); v_out[i] = v_new; - p_out[i] = p[i] - v_new; + p_out[i] = static_cast(p_new); + if (master_p_out) master_p_out[i] = p_new; } } template class LarsMomentumOpCUDAKernel : public framework::OpKernel { + using MPDType = MultiPrecisionType; + public: void Compute(const framework::ExecutionContext& ctx) const override { + const bool multi_precision = ctx.Attr("multi_precision"); + if (multi_precision) { + InnerCompute(ctx, multi_precision); + } else { + InnerCompute(ctx, multi_precision); + } + } + + private: + template + void InnerCompute(const framework::ExecutionContext& ctx, + const bool multi_precision) const { auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); @@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { auto grad = ctx.Input("Grad"); auto learning_rate = ctx.Input("LearningRate"); + const framework::Tensor* master_param = nullptr; + framework::Tensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + + const MT* master_p = multi_precision ? master_param->data() : nullptr; + MT* master_p_out = multi_precision + ? 
master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + T* p_out = param_out->mutable_data(ctx.GetPlace()); - T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + MT* v_out = velocity_out->mutable_data(ctx.GetPlace()); - T mu = static_cast(ctx.Attr("mu")); - T lars_coeff = ctx.Attr("lars_coeff"); - T lars_weight_decay = ctx.Attr("lars_weight_decay"); - T epsilon = ctx.Attr("epsilon"); + MT mu = static_cast(ctx.Attr("mu")); + MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); + MT lars_weight_decay = + static_cast(ctx.Attr("lars_weight_decay")); + MT epsilon = static_cast(ctx.Attr("epsilon")); + MPDType rescale_grad = + static_cast(ctx.Attr("rescale_grad")); auto* p = param->data(); - auto* v = velocity->data(); auto* g = grad->data(); - auto* lr = learning_rate->data(); + auto* v = velocity->data(); + auto* lr = learning_rate->data(); int block = 512; int grid = (param->numel() + block - 1) / block; @@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { framework::Tensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); g_norm_t.Resize({1}); - auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); - auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); + auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); + auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); auto* place = ctx.template device_context().eigen_device(); - ep_norm.device(*place) = eigen_p.square().sum().sqrt(); - eg_norm.device(*place) = eigen_g.square().sum().sqrt(); - MomentumLarsKernel<<>>( + + // Eigen does not support fp16 l2-norm + ep_norm.device(*place) = + eigen_p.template cast().square().sum().sqrt(); + eg_norm.device(*place) = + (eigen_g.template cast() * rescale_grad).square().sum().sqrt(); + + MomentumLarsKernel< + T, MT><<>>( p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out, epsilon); + p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out, + rescale_grad); } }; @@ -93,4 +150,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lars_momentum, ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel); + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index cbb0704fa857b7021acf91ca2f606c3d88aa76a6..f461dec66c0e753cdf170a958f585fa609cd8dac 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); + if (ctx->HasOutput("MasterParamOut")) { + ctx->SetOutputDim("MasterParamOut", param_dim); + } } framework::OpKernelType GetExpectedKernelType( diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc index a8d19148ef520cc2b80b23e119e56f5a7b6f920f..446f578b79ff96171f39f8b0bfe3aede03190f5c 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_npu.cc @@ -32,7 +32,7 @@ class SGDNPUKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner =
NpuOpRunner("ApplyGradientDescent", {*param_var, *learning_rate, *grad_var}, {*param_out}, {}); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 95aaed4453517dd81fcfb277f46df8020be3ac11..087b8ecba6e1fb8b4a0ec44bf6b4dffd5b0e3fb5 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -246,3 +246,18 @@ REGISTER_OP_CPU_KERNEL( ops::PadConstantLikeGradKernel, ops::PadConstantLikeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL( + pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu deleted file mode 100644 index 76faf30ed92000d7093eb73bf6499a43f6ab5b57..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_constant_like_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/pad_constant_like_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 577f4f39411e290a88a91bafb61f7dafa7c1cb5f..3bf66c77badb90543e8351c3bca71418d47ff046 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -174,3 +174,16 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( pad_grad, ops::PadGradKernel, ops::PadGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel, + ops::PadKernel, + ops::PadKernel, + ops::PadKernel, + ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel, + ops::PadGradKernel, + ops::PadGradKernel); diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu deleted file mode 100644 index 391e305352e55188fb0c502b8efe03af597d48ca..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/pad_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel); diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index 12168e61ba5a98fd18c08b2b97911a2e11c02eac..e4d654008d3d03f5136493bf3719636a6c7daf96 100644 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -27,7 +27,7 @@ register_operators(DEPS ${DISTRIBUTE_DEPS}) set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op) +cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op eigen_function) set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS}) +cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index f91496eeab142071fcf87c929cf1327d9b53808d..c2f68675beb6214134cd0f73a2ef40f674e4d935 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -199,9 +199,9 @@ REGISTER_OP_CPU_KERNEL( ops::PyLayerOpKernel, ops::PyLayerOpKernel, ops::PyLayerOpKernel, + ::paddle::platform::complex>, ops::PyLayerOpKernel); + ::paddle::platform::complex>); #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( py_layer, ops::PyLayerOpKernel, @@ -218,7 +218,7 @@ REGISTER_OP_CUDA_KERNEL( ops::PyLayerOpKernel, ops::PyLayerOpKernel, ops::PyLayerOpKernel, + ::paddle::platform::complex>, ops::PyLayerOpKernel); + ::paddle::platform::complex>); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index ec9d1fde4533580f862e35d01fbdb6dd0143495a..01f5b4c73271291f0a0eec8f9ff59412700656ce 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -231,3 +231,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( rank_loss_grad, ops::RankLossGradKernel); + +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu deleted file mode 
100644 index ed805279892d0f045fdde94b30c9bc7b19348a9a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/rank_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/rank_loss_op.h" - -REGISTER_OP_CUDA_KERNEL(rank_loss, - paddle::operators::RankLossKernel< - paddle::platform::CUDADeviceContext, float>); -REGISTER_OP_CUDA_KERNEL(rank_loss_grad, - paddle::operators::RankLossGradKernel< - paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h index 8609958476f60a0c03b399f8fa2a00b29f3a9011..3373c846ce2c4cade675637cd51e12181172e13b 100644 --- a/paddle/fluid/operators/rank_loss_op.h +++ b/paddle/fluid/operators/rank_loss_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -36,8 +37,8 @@ class RankLossKernel : public framework::OpKernel { auto right = framework::EigenVector::Flatten(*right_t); auto& dev = *ctx.template device_context().eigen_device(); - out.device(dev) = - (1.0f + (left - right).exp()).log() - label * (left - right); + EigenRankLoss, T>::Eval(dev, out, label, left, + right); } }; @@ -65,15 +66,15 @@ class RankLossGradKernel : public framework::OpKernel { if (d_left_t) { d_left_t->mutable_data(ctx.GetPlace()); auto d_left = framework::EigenVector::Flatten(*d_left_t); - d_left.device(dev) = - d_out * (1.0f / (1.0f + (right - left).exp()) - label); + EigenRankLossGrad, T>::EvalLeft( + dev, d_left, d_out, label, left, right); } // compute d_right if (d_right_t) { d_right_t->mutable_data(ctx.GetPlace()); auto d_right = framework::EigenVector::Flatten(*d_right_t); - d_right.device(dev) = - -d_out * (1.0f / (1.0f + (right - left).exp()) - label); + EigenRankLossGrad, T>::EvalRight( + dev, d_right, d_out, label, left, right); } } }; diff --git a/paddle/fluid/operators/reduce_ops/cub_reduce.h b/paddle/fluid/operators/reduce_ops/cub_reduce.h index 29e46e091d06858378cb31a1005ec5687797e583..9e1aed5dde4b6532d0226ce17b4912b8ba7304c8 100644 --- a/paddle/fluid/operators/reduce_ops/cub_reduce.h +++ b/paddle/fluid/operators/reduce_ops/cub_reduce.h @@ -366,33 +366,32 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } -template +template class TransformOp> struct TensorReduceFunctor { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; const double& init; const ReduceOp& reducer; - const TransformOp& transformer; gpuStream_t stream; TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const double& init, - const ReduceOp& reducer, const TransformOp& transformer, - gpuStream_t stream) + const ReduceOp& reducer, gpuStream_t stream) : x(x), y(y), 
origin_reduce_dims(origin_reduce_dims), init(init), reducer(reducer), - transformer(transformer), stream(stream) {} template void apply() const { const Ty& init_cast = static_cast(init); - TensorReduce( - x, y, origin_reduce_dims, init_cast, reducer, transformer, stream); + TensorReduce>( + x, y, origin_reduce_dims, init_cast, reducer, TransformOp(), + stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc index 39e74c908ae7ab5c420f07a559804d5aa5a9c216..e9d5c5f14c51f827353f54d1c84b50578ab7d41a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc @@ -38,7 +38,7 @@ class ReduceAnyNPUKernel : public framework::OpKernel { // set attr NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; - auto runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); + const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 8c1d209bdb682222c35d807586c65fe49c2466e9..ba0e81991d75e7d4e10b8c21f45b22e337ac539f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -121,9 +121,9 @@ REGISTER_OP_CPU_KERNEL( ops::ReduceKernel, ops::ReduceKernel, + paddle::platform::complex, ops::SumFunctor>, ops::ReduceKernel, ops::SumFunctor>); @@ -132,10 +132,9 @@ using CPUReduceSumGradKernel = ops::ReduceSumGradKernel; -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel); +REGISTER_OP_CPU_KERNEL( + reduce_sum_grad, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, CPUReduceSumGradKernel, + CPUReduceSumGradKernel>, + CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index dbd020514b2088a336184c4f1ca4f367dd3a14a3..dd16ca4e393e726bf72a5c9167c3a8164dca3539 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -18,11 +18,13 @@ namespace paddle { namespace operators { -template +template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } }; template @@ -56,13 +58,13 @@ class ReduceSumKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctor>( + TensorReduceFunctor( *input, output, reduce_dims, static_cast(0.0), cub::Sum(), - IdentityFunctor(), stream)); + stream)); } else { - TensorReduce>( + TensorReduce>( *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); + IdentityFunctor(), stream); } } }; @@ -70,9 +72,9 @@ class ReduceSumKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum, 
ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel, + ops::ReduceSumKernel>, + ops::ReduceSumKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index 67de8bb9a0c1ab4ae917b7e267fc2748087d900e..230bae0cdd4b1362329740783ee623d1809849d8 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -20,10 +20,9 @@ using CUDAReduceSumGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel>, + CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index f3b6e69a48bcb05563bc141e59863f95d6c17e30..78bd42ff00c83f409d1ec3d094ab8a03a2a68eb2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -51,7 +51,7 @@ class ReduceSumNPUKernel : public framework::OpKernel { cast_x.Resize(x->dims()); cast_x.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); - auto runner_cast = NpuOpRunner( + const auto& runner_cast = NpuOpRunner( "Cast", {*x}, {cast_x}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast.Run(stream); @@ -68,20 +68,22 @@ class ReduceSumNPUKernel : public framework::OpKernel { dim_vec.push_back(i); } - auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, - {{"axes", dim_vec}, {"keep_dims", keep_dims}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, + {{"axes", dim_vec}, {"keep_dims", keep_dims}}); runner.Run(stream); } else { - auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, - {{"axes", dims}, {"keep_dims", keep_dims}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, + {{"axes", dims}, {"keep_dims", keep_dims}}); runner.Run(stream); } if (x->type() != framework::proto::VarType::FP32 && x->type() != framework::proto::VarType::FP16) { auto dst_dtype = ConvertToNpuDtype(out->type()); - auto runner_cast = + const auto& runner_cast = NpuOpRunner("Cast", {cast_out}, {*out}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast.Run(stream); @@ -107,8 +109,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); if (keep_dims || reduce_all) { - auto runner = NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad}, - {{"shape", framework::vectorize(x->dims())}}); + const auto& runner = + NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad}, + {{"shape", framework::vectorize(x->dims())}}); runner.Run(stream); } else { framework::DDim out_dims; @@ -124,8 +127,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel { &out_grad_tmp); out_grad_tmp.Resize(out_dims); - auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad}, - {{"shape", framework::vectorize(x->dims())}}); + const auto& runner = + NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad}, + {{"shape", framework::vectorize(x->dims())}}); runner.Run(stream); } } diff --git 
a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index e119a21caa23cb937894031a3abec7c33b843615..717029cb8f11733ff03c54949554b91ed1ffe09c 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -613,23 +613,24 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR( reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel, - paddle::platform::bfloat16, ops::ReshapeKernel, paddle::platform::complex64, - ops::ReshapeKernel, paddle::platform::complex128, ops::ReshapeKernel); + paddle::platform::bfloat16, ops::ReshapeKernel, + paddle::platform::complex, ops::ReshapeKernel, + paddle::platform::complex, ops::ReshapeKernel); REGISTER_OP_CPU_KERNEL_FUNCTOR( reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel, - paddle::platform::complex64, ops::ReshapeGradKernel, - paddle::platform::complex128, ops::ReshapeGradKernel); + paddle::platform::complex, ops::ReshapeGradKernel, + paddle::platform::complex, ops::ReshapeGradKernel); REGISTER_OP_CPU_KERNEL_FUNCTOR( reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool, ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16, - ops::ReshapeDoubleGradKernel, paddle::platform::complex64, - ops::ReshapeDoubleGradKernel, paddle::platform::complex128, + ops::ReshapeDoubleGradKernel, paddle::platform::complex, + ops::ReshapeDoubleGradKernel, paddle::platform::complex, ops::ReshapeDoubleGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -650,22 +651,23 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, uint8_t, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, plat::float16, ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex64, ops::ReshapeKernel, - plat::complex128, ops::ReshapeKernel); + plat::complex, ops::ReshapeKernel, + plat::complex, ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR( reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, - ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex64, - ops::ReshapeGradKernel, plat::complex128, ops::ReshapeGradKernel); + ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex, + ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR( reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, plat::float16, ops::ReshapeDoubleGradKernel, bool, - ops::ReshapeDoubleGradKernel, plat::complex64, ops::ReshapeDoubleGradKernel, - plat::complex128, ops::ReshapeDoubleGradKernel); + ops::ReshapeDoubleGradKernel, plat::complex, + ops::ReshapeDoubleGradKernel, plat::complex, + ops::ReshapeDoubleGradKernel); #endif #ifdef PADDLE_WITH_XPU @@ -673,14 +675,14 @@ REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, 
int64_t, ops::ReshapeKernel, plat::float16, ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex64, ops::ReshapeKernel, - plat::complex128, ops::ReshapeKernel); + plat::complex, ops::ReshapeKernel, + plat::complex, ops::ReshapeKernel); REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel, bool, - ops::ReshapeGradKernel, plat::complex64, - ops::ReshapeGradKernel, plat::complex128, + ops::ReshapeGradKernel, plat::complex, + ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc index 8b2b9f464b407ba27333e354854a70a233986853..98a1610be607e8bcd6d14a25a45d1856a64dbe8a 100644 --- a/paddle/fluid/operators/reverse_op.cc +++ b/paddle/fluid/operators/reverse_op.cc @@ -145,4 +145,12 @@ REGISTER_OP_CPU_KERNEL( ops::ReverseKernel, ops::ReverseKernel, ops::ReverseKernel, - ops::ReverseKernel) + ops::ReverseKernel); + +REGISTER_OP_CUDA_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel); diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/operators/reverse_op.cu deleted file mode 100644 index 635c41529b38f2dd287b00ed2e5659e11f619e78..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reverse_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
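Note: the reverse_op.h hunk just below shows the shape of the eigen_function indirection used by all of these ops: the inline `out.device(dev) = in.reverse(axes)` becomes an `EigenReverse<...>::Eval(...)` call, and `Eigen::array` becomes `Eigen::DSizes` in the signature. A hypothetical sketch of such a wrapper (using Eigen::array for the boolean flags, which plays the same role as the DSizes in the diff):

#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical wrapper in the spirit of eigen_function.h: the expression is
// written once here and explicitly instantiated per device in a dedicated
// .cc/.cu, so op headers stay device-agnostic.
template <typename EigenDevice, typename T, int Rank>
struct EigenReverseSketch {
  template <typename OutTensor, typename InTensor>
  static void Eval(const EigenDevice& dev, OutTensor out, const InTensor& in,
                   const Eigen::array<bool, Rank>& reverse_axis) {
    out.device(dev) = in.reverse(reverse_axis);
  }
};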
- -#include "paddle/fluid/operators/reverse_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - reverse, ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel) diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h index 2813f7a4864a9ee84cefd8c824ee6f277b192dec..bf91e2f57a6676da7fca0a89564e59d99dd72981 100644 --- a/paddle/fluid/operators/reverse_op.h +++ b/paddle/fluid/operators/reverse_op.h @@ -16,6 +16,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -23,7 +24,7 @@ template struct ReverseFunctor { void operator()(const DeviceContext& context, const framework::LoDTensor& in, framework::LoDTensor* out, const std::vector& axis) { - Eigen::array reverse_axis; + Eigen::DSizes reverse_axis; for (int i = 0; i < Rank; ++i) { reverse_axis[i] = false; } @@ -37,9 +38,10 @@ struct ReverseFunctor { auto in_eigen = framework::EigenTensor::From(in); auto out_eigen = framework::EigenTensor::From(*out); - auto* dev = context.eigen_device(); + auto& dev = *context.eigen_device(); - out_eigen.device(*dev) = in_eigen.reverse(reverse_axis); + EigenReverse, T, Rank>::Eval( + dev, out_eigen, in_eigen, reverse_axis); } }; diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 2d599716443901053aa3d5dc8e93759320175b24..69b2c5b73800738ed740cc59786c42222a1d9e35 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -83,6 +83,13 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "contains at most one scope." 
"NOTE: Do not use Scope directly because Scope output is not " "currently supported."); + AddOutput("DOut", + "(vector)" + "The output tensors for GRAD Tensors in RunProgram forward " + "operator, the forward operator contains GRAD Tensors when it " + "computes double grad.") + .AsDuplicable() + .AsDispensable(); AddAttr("global_block", "(BlockDesc *)" "The global block of executed program desc."); @@ -154,6 +161,7 @@ class RunProgramGradOpMaker : public framework::SingleGradOpMaker { grad_op->SetInput("Params", this->Input("Params")); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetInput("OutScope", this->Output("OutScope")); + grad_op->SetInput("DOut", this->Output("DOut")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetOutput(framework::GradVarName("Params"), this->InputGrad("Params")); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index f78f5c5b948c63e02d9121c540b6207c30b2d0f9..c7aeb0e145e4cb704c56dabb2f090e63ecb280a7 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -131,6 +131,9 @@ static void ShareVarsIntoScope(const std::vector &vars, const std::vector &var_names, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + if (var_names[i] == "Fake_var") { + continue; + } auto *var = scope->Var(var_names[i]); CheckInputVarStatus(*vars[i], var_names[i]); VariableShare(*vars[i], var); @@ -141,9 +144,9 @@ static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { - if (var_names[i] == framework::kEmptyVarName) { - VLOG(2) << "find variable name is " << framework::kEmptyVarName - << ", skip it!"; + if (var_names[i] == framework::kEmptyVarName || + var_names[i] == "Fake_var") { + VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } // NOTE: Here skip not found var is dangerous, if a bug is caused here, @@ -170,9 +173,11 @@ class RunProgramOpKernel : public framework::OpKernel { auto &input_vars = ctx.MultiInputVar("X"); auto ¶m_vars = ctx.MultiInputVar("Params"); auto output_vars = ctx.MultiOutputVar("Out"); + auto dout_vars = ctx.MultiOutputVar("DOut"); auto input_var_names = ctx.InputNames("X"); auto output_var_names = ctx.OutputNames("Out"); + auto dout_var_names = ctx.OutputNames("DOut"); // current program may not hold parameters std::vector param_names; @@ -195,7 +200,7 @@ class RunProgramOpKernel : public framework::OpKernel { // Step 2. prepare executor and init persistable variables framework::Executor exe(ctx.GetPlace()); auto exe_ctx = framework::GetExecutorInfoFromCache( - exe, ctx, {output_var_names}, /*is_grad=*/false); + exe, ctx, {output_var_names, dout_var_names}, /*is_grad=*/false); // NOTE(Aurelius84): While training some models, forward can be called many // times and then apply backpropagation all at once, such as Reinforcement @@ -219,6 +224,7 @@ class RunProgramOpKernel : public framework::OpKernel { // Step 4. 
Get Output details::ShareVarsFromScope(output_vars, output_var_names, &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index a71f49585bfcad0c03f17df647173fd946b4b7e2..a195452791048d9875602285551a00cf6e42c7a8 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" #include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace framework { @@ -155,3 +156,18 @@ REGISTER_OP_CPU_KERNEL( ops::ScaleKernel, ops::ScaleKernel, ops::ScaleKernel); + +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu deleted file mode 100644 index e1f20a73b20fc23ec8b99ba0e5154eb184718ca3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scale_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/scale_op.h" -#include "paddle/fluid/platform/float16.h" -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - scale, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 11c81d23b2ed271ce89e6a27b1179e7d06dd0ebd..544f0a916681e6fe0042b0e7c3af537f5d464214 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -16,6 +16,7 @@ limitations under the License. 
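Note: the scale_op.h hunk below collapses the old two-branch Eigen code into a single EigenScale::Eval call. What that call computes, as a hypothetical scalar model of the two branches the diff removes from ScaleKernel:

// Hypothetical scalar sketch of the scale op (mirrors the removed branches):
template <typename T>
T ScaleSketch(T x, T scale, T bias, bool bias_after_scale) {
  // bias_after_scale: out = scale * x + bias; otherwise out = scale * (x + bias)
  return bias_after_scale ? scale * x + bias : scale * (x + bias);
}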
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -68,11 +69,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); auto& dev = *ctx.template device_context().eigen_device(); - if (bias_after_scale) { - eigen_out.device(dev) = scale * eigen_in + bias; - } else { - eigen_out.device(dev) = scale * (eigen_in + bias); - } + EigenScale, T>::Eval( + dev, eigen_out, eigen_in, scale, bias, bias_after_scale); } }; diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index cbfd11834ae47710bc8b80df15400689a50af6bc..6fb0e6d372745dc412a653e2fa27b398d1e16a5e 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -38,7 +38,7 @@ class ScaleNPUKernel : public framework::OpKernel { << " ,bias_after_scale:" << bias_after_scale; if (bias_after_scale) { out->mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("Power", {*x}, {*out}, {{"power", _power}, {"scale", scale}, {"shift", bias}}); @@ -47,12 +47,13 @@ class ScaleNPUKernel : public framework::OpKernel { Tensor tmp_x(x->type()); tmp_x.Resize(x->dims()); tmp_x.mutable_data(ctx.GetPlace()); - auto runner_tmp = NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}}); + const auto& runner_tmp = + NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}}); runner_tmp.Run(stream); out->mutable_data(ctx.GetPlace()); float _bias = 0.0; - auto runner = + const auto& runner = NpuOpRunner("Power", {tmp_x}, {*out}, {{"power", _power}, {"scale", scale}, {"shift", _bias}}); runner.Run(stream); diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index e2e49acb94c7b22120acbd614c2f0ac139540f3c..d0183c6ed57c4dd59f51b8246772287024b8bf77 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -53,11 +53,11 @@ class ScatterNPUKernel : public framework::OpKernel { .stream(); if (overwrite) { - auto runner_update = NpuOpRunner("TensorScatterUpdate", - {*x, *index, *updates}, {*out}, {}); + const auto& runner_update = NpuOpRunner( + "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); runner_update.Run(stream); } else { - auto runner_add = + const auto& runner_add = NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); runner_add.Run(stream); } diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index c83726180baeae6f4b73adda3bd9d9127b0f3e26..f94fce66806eee82f2c3434161426a19aa9d916e 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -54,4 +54,6 @@ TEST(scatter, ScatterUpdate) { EXPECT_EQ(output.data()[i], static_cast(i - 4)); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output.data()[i], 0.0f); + + delete cpu_place; } diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index 3485b4e5c2fbebd83e8f5ee34437db35ce5f1f20..6207c33f9d6299605d24f11c13820eac47ee6c98 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
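Note: the one-line scatter_test.cc change above (`delete cpu_place;`) plugs a leak of a new-ed place object. A hypothetical sketch of the smart-pointer alternative that would make the manual delete unnecessary (illustrative names, not the test's actual types):

#include <memory>

struct CPUPlaceSketch {};  // stand-in for the test's place type

void TestBodySketch() {
  auto cpu_place = std::make_unique<CPUPlaceSketch>();
  // ... pass cpu_place.get() where a raw pointer is expected ...
}  // freed automatically here, no trailing delete needed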
*/ #include "paddle/fluid/operators/sign_op.h" #include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -69,3 +70,10 @@ REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, REGISTER_OP_CPU_KERNEL( sign, ops::SignKernel, ops::SignKernel); + +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel, + paddle::operators::SignKernel, + paddle::operators::SignKernel); diff --git a/paddle/fluid/operators/sign_op.cu b/paddle/fluid/operators/sign_op.cu deleted file mode 100644 index 817e0fbbd511462f161633242d28e63062676eb9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sign_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/sign_op.h" -#include "paddle/fluid/platform/float16.h" - -REGISTER_OP_CUDA_KERNEL( - sign, - paddle::operators::SignKernel, - paddle::operators::SignKernel, - paddle::operators::SignKernel); diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b99934daee17e2b8a9295b488c0483e47187a009..b6d501afa621ac490be4ef3e567434779c61b0aa 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -31,7 +32,8 @@ class SignKernel : public framework::OpKernel { auto eigen_in = framework::EigenVector::Flatten(*in); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_in.sign(); + EigenSign, T>::Eval(place, eigen_out, + eigen_in); } }; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 0a41424cfa11864879ff93d3807a3746a685b00d..b5298979721642065ae75bcf98bb8b44435038a3 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -436,9 +436,9 @@ REGISTER_OP_CPU_KERNEL( ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, + paddle::platform::complex>, ops::SliceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( slice_grad, ops::SliceGradKernel, @@ -446,6 +446,31 @@ REGISTER_OP_CPU_KERNEL( ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, + paddle::platform::complex>, ops::SliceGradKernel); + paddle::platform::complex>); + +REGISTER_OP_CUDA_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>); + +REGISTER_OP_CUDA_KERNEL( + slice_grad, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu deleted file mode 100644 index 5f80d3cc971f5413b8cb6f64cfa860af9013fa2b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/slice_op.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/slice_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - slice, ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel); - -REGISTER_OP_CUDA_KERNEL( - slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 22f6fa9e3e6f206b33c46369086d1637fdc83457..3d294ae238986c8cd7f18109871a559679553db0 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -17,6 +17,7 @@ limitations under the License. 
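Note: before the slice_op.h changes that follow, the operation in one line: per dimension, out = in[offset : offset + extent], with the gradient computed by padding d_out back to the input shape (hence the EigenSlice/EigenPad pair). The unchanged framework::To32BitIndex fast path exists because Eigen reportedly indexes faster with 32-bit indices when the tensor fits. A hypothetical 1-D scalar sketch of the forward op:

#include <vector>

// Hypothetical 1-D model of slice: copy `extent` elements starting at
// `offset`; the grad kernel is the inverse, padding with zeros.
std::vector<float> Slice1D(const std::vector<float>& in, int offset,
                           int extent) {
  return {in.begin() + offset, in.begin() + offset + extent};
}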
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" @@ -238,8 +239,8 @@ class SliceKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); auto new_out_dims = out->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = new_out_dims[i]; @@ -268,10 +269,12 @@ class SliceKernel : public framework::OpKernel { offsets_32bit[i] = offsets[i]; extents_32bit[i] = extents[i]; } - framework::To32BitIndex(out_t).device(place) = - framework::To32BitIndex(in_t).slice(offsets_32bit, extents_32bit); + EigenSlice, T, D>::Eval( + place, framework::To32BitIndex(out_t), framework::To32BitIndex(in_t), + offsets_32bit, extents_32bit); } else { - out_t.device(place) = in_t.slice(offsets, extents); + EigenSlice, T, D>::Eval(place, out_t, in_t, + offsets, extents); } out->Resize(out_dims); @@ -624,10 +627,12 @@ class SliceGradKernel : public framework::OpKernel { paddings_32bit[i] = std::make_pair(paddings[i].first, paddings[i].second); } - framework::To32BitIndex(d_in_t).device(place) = - framework::To32BitIndex(d_out_t).pad(paddings_32bit, T(0)); + EigenPad, T, D>::Eval( + place, framework::To32BitIndex(d_in_t), + framework::To32BitIndex(d_out_t), paddings_32bit, static_cast(0)); } else { - d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + EigenPad, T, D>::Eval( + place, d_in_t, d_out_t, paddings, static_cast(0)); } } }; diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 9974536da9acb401a859c2c9f1d10d79eed680bb..8e0d4b4a019921661e1a3da78ed3f69ef20356ec 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -72,8 +72,8 @@ class SliceNPUKernel : public framework::OpKernel { UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - auto runner = NpuOpRunner("SliceD", {*input}, {*out}, - {{"offsets", offsets}, {"size", size}}); + const auto& runner = NpuOpRunner("SliceD", {*input}, {*out}, + {{"offsets", offsets}, {"size", size}}); auto stream = ctx.template device_context() @@ -111,7 +111,7 @@ class SliceGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = + const auto& runner = NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); runner.Run(stream); } diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc index 0e94f6af232f98e093953e1aee37306eb460211d..212b600fda1ae88588d6401e9407268a995ad752 100644 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -31,7 +31,7 @@ class SoftmaxNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); + const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); auto stream = ctx.template device_context() @@ -71,8 +71,8 @@ class SoftmaxGradNPUKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); framework::NPUAttributeMap attr_input = {}; - auto runner = NpuOpRunner(std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, - {*dX}, attr_input); + const auto& runner = NpuOpRunner(std::string("SoftmaxGrad"), + {tmp_out, tmp_dOut}, {*dX}, attr_input); 
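[Editor's note on the slice_op.h hunk above: Eigen::array becomes Eigen::DSizes (the index-sequence type Eigen's slicing API is specified over), and the inline .slice(...) / .pad(...) expressions are routed through EigenSlice / EigenPad from eigen/eigen_function.h. A hedged sketch of the slice wrapper, with the tensor-map types reduced to template parameters (the upstream header spells them out):

template <typename EigenDevice, typename T, size_t D>
struct EigenSlice {
  using Array = Eigen::DSizes<Eigen::DenseIndex, D>;
  template <typename OutTensor, typename InTensor>
  static void Eval(const EigenDevice& dev, OutTensor out, const InTensor& in,
                   const Array& offsets, const Array& extents) {
    // TensorMaps are cheap to copy and write through to the buffer they map,
    // so pass-by-value is fine here.
    out.device(dev) = in.slice(offsets, extents);
  }
};

EigenPad is analogous, taking the Eigen::array<std::pair<int64_t, int64_t>, D> of paddings visible above plus a fill value, hence the explicit static_cast<T>(0) at the call sites.]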
auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 8fe456edeabf11742d96b48c08204a29c6028132..4aec4c174227921d6b396033d26550145dbd6bb2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -15,481 +15,44 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/softmax_impl.cuh" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/for_range.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/platform/miopen_helper.h" -#else -#include "paddle/fluid/platform/cudnn_helper.h" -#endif namespace paddle { namespace operators { -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; -// Wrapper of log function. Use log(float32) for float16 -template -static __device__ __forceinline__ T Log(T x) { - using AccT = typename details::MPTypeTrait::Type; - AccT logx = std::log(static_cast(x)); - return math::TolerableValue()(static_cast(logx)); -} - -// Wrapper of exp function. Use exp(float32) for float16 +namespace { template -static __device__ __forceinline__ T Exp(T x) { - using AccT = typename details::MPTypeTrait::Type; - AccT expx = std::exp(static_cast(x)); - return math::TolerableValue()(static_cast(expx)); -} - -// log2(value) -static inline int Log2Ceil(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) ++log2_value; - return log2_value; -} - -enum class SoftmaxMode { kSoftmax, kLogSoftmax, kCrossEntropy }; - -/* - Hard label cross entropy. -*/ -template -__global__ void CrossEntropyHardLabel(T* loss, const T* softmax, - const int64_t* labels, const int n, - const int dim, const int d, - const int ignore_idx) { - int64_t ids = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = ids / d; - int64_t idx_d = ids % d; - - // thread ids compute loss[ids] using softmax[idx] - if (ids < n * d) { - int64_t idx = idx_n * dim * d + labels[ids] * d + idx_d; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (labels[ids] == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -Log(softmax[idx]); - } - } else { - // IgnoreIndex is false - loss[ids] = -Log(softmax[idx]); - } - } -} - -/* - Hard label cross entropy with exp. 
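[Editor's note: the deleted Log / Exp helpers above lost their template parameter lists to extraction. Restored (a close reconstruction, consistent with the MPTypeTrait usage that remains visible), the log case was:

// Do the transcendental in the accumulation type (float when T is
// platform::float16), then clamp through math::TolerableValue so inf/nan
// does not leak back into the narrow type.
template <typename T>
static __device__ __forceinline__ T Log(T x) {
  using AccT = typename details::MPTypeTrait<T>::Type;
  AccT logx = std::log(static_cast<AccT>(x));
  return math::TolerableValue<T>()(static_cast<T>(logx));
}

In the deleted CrossEntropyHardLabel kernel, the logits are viewed as [n, dim, d]; each thread owns one (idx_n, idx_d) pair, so the logit picked out by the label sits at idx = idx_n * dim * d + labels[ids] * d + idx_d.]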
- Input: log softmax - Output: loss and exp(input) -*/ -template -__global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, - const int64_t* labels, const int n, - const int dim, const int d, - const int ignore_idx) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = idx / (d * dim); - int64_t idx_dim = (idx / d) % dim; - int64_t idx_d = idx % d; - int64_t ids = idx_n * d + idx_d; - - if (idx < n * dim * d) { - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (idx_dim == labels[ids]) { - if (labels[ids] == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -softmax[idx]; - } - } - } else { - // IgnoreIndex is false - if (labels[ids] >= 0 && labels[ids] < dim) { - if (labels[ids] == idx_dim) { - loss[ids] = -softmax[idx]; - } - } else { - loss[ids] = static_cast(0.0); - } - } - softmax[idx] = Exp(softmax[idx]); - } -} - -/* - Core function of softmax with cross entropy forward - - softmax, SoftmaxMode=kSoftmax - - log softmax, SoftmaxMode=kLogSoftmax - - softmax with cross entropy hard label, SoftmaxMode=kCrossEntropy - The computation includes - - Compute max value: maxvalue_{i} = max_j src_{i,j} - - Compute sum of exp: s_{i} = sum_{j}{e^{src_{i,j} - maxvalue_{i}}} - - Compute: softmax_{i,j} = e^{src_{i,j} - maxvalue_{i}} / s_{i} - - Compute: logsoftmax_{i,j} = src_{i,j} - maxvalue_{i} - log(s_{i}) - - Compute: loss_{i} = -logsoftmax[i,label[i]] (Hard label) - This computation results from following formula: - softmax_{i,j} = e^{src_{i,j}} / sum_{j}{e^{src_{i,j}}} - = e^{src_{i,j} - maxvalue_{i}} - / sum_{j}{e^{src_{i,j} - maxvalue_{i}}} - = e^{src_{i,j} - maxvalue_{i}} / s_{i} - logsoftmax_{i,j} = log(softmax_{i,j}) - = src_{i,j} - maxvalue_{i} - log(s_{i}) - One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). - For reduction max (sum), firstly compute max (sum) to one warp, then use - shuffle api to compute max (sum) in one warp. -*/ -template -__global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, - const int64_t* label, const int batch_size, - const int stride, const int element_count, - const int ignore_index) { - constexpr int kDimCeil = 1 << Log2Elements; - constexpr int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; - constexpr int kVSize = sizeof(VecT) / sizeof(T); - constexpr int kIterations = kDimCeil / kWarpSize; - constexpr int kIterationsV = - (kIterations >= kVSize) ? (kIterations / kVSize) : 1; - constexpr int kBatchSize = (kDimCeil <= 128) ? 2 : 1; - - int first_batch = (blockDim.y * blockIdx.x + threadIdx.y) * kBatchSize; - - // max index to read - int idx_max_v[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; i++) { - int idx_max = ((i + first_batch) < batch_size) ? 
element_count : 0; - idx_max_v[i] = idx_max / kVSize; - } - - // read data from global memory - AccT srcdata[kBatchSize][kIterationsV][kVSize]; - -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { -// read data to srcdata: - KVSize==1, - KVSize>1 -#pragma unroll - for (int it = 0; it < kIterationsV; ++it) { - int src_idx = threadIdx.x + it * kWarpSize; - if (kVSize == 1) { - if (src_idx < idx_max_v[i]) { - srcdata[i][it][0] = - static_cast(src[(first_batch + i) * stride + src_idx]); - } else { - srcdata[i][it][0] = -std::numeric_limits::infinity(); - } - } else { - const VecT* src_v = - reinterpret_cast(&src[(first_batch + i) * stride]); - if (src_idx < idx_max_v[i]) { - VecT srctmp = src_v[src_idx]; - const T* srcinptr = reinterpret_cast(&srctmp); -#pragma unroll - for (int s = 0; s < kVSize; s++) { - srcdata[i][it][s] = static_cast(srcinptr[s]); - } - } else { -#pragma unroll - for (int s = 0; s < kVSize; s++) { - srcdata[i][it][s] = -std::numeric_limits::infinity(); - } - } - } - } - } - - // compute max value: maxvalue_{i} = max_j src_{i,j} - AccT max_value[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - // it = 0 - AccT valmax = srcdata[i][0][0]; -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - valmax = (valmax > srcdata[i][0][s]) ? valmax : srcdata[i][0][s]; - } - max_value[i] = valmax; - -// it = 1, 2, ... -#pragma unroll - for (int it = 1; it < kIterationsV; ++it) { - AccT valmax = srcdata[i][it][0]; -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - valmax = (valmax > srcdata[i][it][s]) ? valmax : srcdata[i][it][s]; - } - max_value[i] = (max_value[i] > valmax) ? max_value[i] : valmax; - } - } - WarpReduceMax(max_value); - - // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } - AccT sum[kBatchSize]; -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - // it = 0 - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] = std::exp(srcdata[i][0][0] - max_value[i]); - } else { - srcdata[i][0][0] = std::exp(srcdata[i][0][0] - max_value[i]); - sum[i] = srcdata[i][0][0]; - } -#pragma unroll - for (int s = 1; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] += std::exp(srcdata[i][0][s] - max_value[i]); - } else { - srcdata[i][0][s] = std::exp(srcdata[i][0][s] - max_value[i]); - sum[i] += srcdata[i][0][s]; - } - } - -// it = 1, 2, ... 
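[Editor's note: WarpReduceMax / WarpReduceSum come from softmax_impl.cuh, which this file no longer includes, so their definitions are outside this hunk. For orientation, a minimal single-value stand-in for the max case (sketch only; the deleted helpers reduce kBatchSize values per call):

template <typename T>
__device__ __forceinline__ T WarpMax(T val) {
  // Butterfly (xor) exchange: after 5 rounds on a 32-lane warp, every lane
  // holds the warp-wide maximum.
  for (int offset = 16; offset > 0; offset >>= 1) {
    T other = __shfl_xor_sync(0xffffffffu, val, offset);
    val = val > other ? val : other;
  }
  return val;
}]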
-#pragma unroll - for (int it = 1; it < kIterationsV; ++it) { -#pragma unroll - for (int s = 0; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] += std::exp(srcdata[i][it][s] - max_value[i]); - } else { - srcdata[i][it][s] = std::exp(srcdata[i][it][s] - max_value[i]); - sum[i] += srcdata[i][it][s]; - } - } - } - } - WarpReduceSum(sum); - -// write data -#pragma unroll - for (int i = 0; i < kBatchSize; ++i) { - if (mode == SoftmaxMode::kLogSoftmax || - mode == SoftmaxMode::kCrossEntropy) { - sum[i] = std::log(sum[i]); - } - -#pragma unroll - for (int it = 0; it < kIterationsV; ++it) { - int idx = threadIdx.x + it * kWarpSize; - if (kVSize == 1) { // kVSize==1 - if (idx < idx_max_v[i]) { - if (mode == SoftmaxMode::kLogSoftmax) { // log softmax - softmax[(first_batch + i) * stride + idx] = - srcdata[i][it][0] - max_value[i] - sum[i]; - // softmax with cross entropy hard label - } else if (mode == SoftmaxMode::kCrossEntropy) { - AccT logsoftmax = srcdata[i][it][0] - max_value[i] - sum[i]; - // softmax - softmax[(first_batch + i) * stride + idx] = std::exp(logsoftmax); - // label - int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (label[first_batch + i] == loss_idx) { - if (label[first_batch + i] != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { - // IgnoreIndex is false - if (label[first_batch + i] >= 0 && - label[first_batch + i] < element_count) { - if (label[first_batch + i] == loss_idx) { - loss[first_batch + i] = -logsoftmax; - } - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { // softmax - softmax[(first_batch + i) * stride + idx] = - srcdata[i][it][0] / sum[i]; - } - } else { - break; - } - } else { // KVSize>1 - VecT* softmax_v = - reinterpret_cast(&softmax[(first_batch + i) * stride]); - VecT tmpdata; - T* tmpptr = reinterpret_cast(&tmpdata); -#pragma unroll - for (int s = 0; s < kVSize; ++s) { - if (mode == SoftmaxMode::kLogSoftmax) { // log softmax - tmpptr[s] = srcdata[i][it][s] - max_value[i] - sum[i]; - // softmax with cross entropy hard label - } else if (mode == SoftmaxMode::kCrossEntropy) { - AccT logsoftmax = srcdata[i][it][s] - max_value[i] - sum[i]; - // softmax - tmpptr[s] = std::exp(logsoftmax); - // label - int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (label[first_batch + i] == loss_idx && - label[first_batch + i] != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } - } else { - // IgnoreIndex is false - if (label[first_batch + i] >= 0 && - label[first_batch + i] < element_count) { - if (label[first_batch + i] == loss_idx) { - loss[first_batch + i] = -logsoftmax; - } - } else { - loss[first_batch + i] = static_cast(0.0); - } - } - } else { // softmax - tmpptr[s] = srcdata[i][it][s] / sum[i]; - } - } - if (idx < idx_max_v[i]) { - softmax_v[idx] = tmpdata; - } else { - break; - } - } +__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels, + const int64_t n, const int64_t d, + const int64_t remain, const int ignore_index) { + CUDA_KERNEL_LOOP_TYPE(index, n * remain, int64_t) { + int64_t idx_n = index / remain; + int64_t idx_remain = index % remain; + int64_t tmp = labels[index]; + if (ignore_index != tmp) { + int64_t idx = idx_n * d + tmp * remain + idx_remain; + logit_grad[idx] -= static_cast(1.); } } } -#define 
SOFTMAX_WARP_FORWARD_CASE(Log2Elements, VecT, AccT) \ - case Log2Elements: \ - WarpSoftmaxForward<<>>( \ - loss, softmax, src, label, batch_size, stride, element_count, \ - ignore_index); \ - break; - -/* - Wrapper of softmax with cross entropy forward hard label. -*/ -template -void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, - const int64_t* label, const int batch_size, - const int stride, const int element_count, - const int ignore_index, gpuStream_t stream) { - using AccT = typename details::MPTypeTrait::Type; - - // use 128 threads per block to maximimize gpu utilization - const int Log2Elements = static_cast(Log2Ceil(element_count)); - const int kDimCeil = 1 << Log2Elements; - int kWarpSize = (kDimCeil < 32) ? kDimCeil : 32; - int batches_per_warp = (kDimCeil <= 128) ? 2 : 1; - constexpr int threads_per_block = 128; - int warps_per_block = (threads_per_block / kWarpSize); - int batches_per_block = warps_per_block * batches_per_warp; - int blocks = (batch_size + batches_per_block - 1) / batches_per_block; - dim3 threads(kWarpSize, warps_per_block, 1); - - switch (Log2Elements) { - SOFTMAX_WARP_FORWARD_CASE(0, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(1, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(2, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(3, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(4, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(5, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(6, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(7, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(8, T, AccT); - SOFTMAX_WARP_FORWARD_CASE(9, T, AccT); - default: - break; - } -} - -/* - Wrapper of softmax with cross entropy hard label. - - SwitchWarpSoftmaxForward for small size - - cudnn function for large size -*/ -template -static void SoftmaxWithCrossEntropyHardLabel( - const platform::CUDADeviceContext& ctx, int rank, int axis, - const T* logits_data, const int64_t* labels_data, T* loss_data, - T* softmax_data, int N, int dim, int D, const int ignore_index) { - auto stream = ctx.stream(); - constexpr int max_dim = 320; - if (D == 1 && dim <= max_dim) { // small size - const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, dim, - ignore_index, stream); - } else { - ScopedTensorDescriptor desc; - std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); -#else - cudnnTensorDescriptor_t descp = desc.descriptor(layout, tensor_dims); -#endif - - auto handle = ctx.cudnn_handle(); - -#ifdef PADDLE_WITH_HIP - auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE - : MIOPEN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), descp, logits_data, - platform::CudnnDataType::kZero(), descp, softmax_data, - MIOPEN_SOFTMAX_LOG, mode)); -#else - auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE - : CUDNN_SOFTMAX_MODE_CHANNEL; - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - descp, logits_data, platform::CudnnDataType::kZero(), descp, - softmax_data)); -#endif - int threads = 128; - int blocks = (N * dim * D + threads - 1) / threads; - // compute cross entropy, input is log softmax - CrossEntropyExpHardLabel<<>>( - loss_data, softmax_data, labels_data, N, dim, D, ignore_index); - } -} - -/* - Wrapper of softmax with cross entropy grad hard label. 
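[Editor's note: restating the identities from the deleted WarpSoftmaxForward doc comment in cleaner notation, with $m_i = \max_j x_{i,j}$ and $s_i = \sum_j e^{x_{i,j} - m_i}$:

$$\mathrm{softmax}_{i,j} = \frac{e^{x_{i,j} - m_i}}{s_i}, \qquad \log\mathrm{softmax}_{i,j} = x_{i,j} - m_i - \log s_i, \qquad \mathrm{loss}_i = -\log\mathrm{softmax}_{i,\,\mathrm{label}_i}.$$

Subtracting the row maximum before exponentiating is what keeps the kernel finite in fp16/fp32, and the log-softmax form then costs only one extra $\log s_i$ rather than a second pass over the row.]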
-*/ template -__global__ void SoftmaxWithCrossEntropyGradHardLabel( - T* logits_grad, const T* loss_grad, const int64_t* labels, const int64_t n, - const int64_t dim, const int64_t d, const int ignore_index) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - int64_t idx_n = idx / (d * dim); - int64_t idx_dim = (idx / d) % dim; - int64_t idx_d = idx % d; - int64_t ids = idx_n * d + idx_d; - - if (idx < n * dim * d) { - if (labels[ids] == ignore_index) { - logits_grad[idx] = static_cast(0.0); - } else if (labels[ids] == idx_dim) { - logits_grad[idx] = - (logits_grad[idx] - static_cast(1.0)) * loss_grad[ids]; +__global__ void Scale(T* logit_grad, const T* loss_grad, const int64_t num, + const int64_t d, const int64_t remain, + const int64_t* labels, const int ignore_index) { + CUDA_KERNEL_LOOP_TYPE(index, num, int64_t) { + int64_t idx_n = index / d; + int64_t idx_remain = index % remain; + int64_t idx_lbl = idx_n * remain + idx_remain; + if (labels[idx_lbl] == ignore_index) { + logit_grad[index] = static_cast(0.); } else { - logits_grad[idx] *= loss_grad[ids]; + logit_grad[index] *= loss_grad[idx_lbl]; } } } @@ -560,6 +123,8 @@ __global__ void ScaleCrossEntropyGradient(T* logit_grad, const T* loss_grad, } } +} // namespace + static __device__ __forceinline__ platform::float16 exp_on_device( platform::float16 x) { return ::Eigen::numext::exp(x); @@ -831,6 +396,278 @@ static __global__ void RowReductionForCrossEntropy(const T* logits_data, if (threadIdx.x == 0) loss_data[blockIdx.x] = loss; } +template +struct HardLabelCrossEntropyFunctor { + public: + HardLabelCrossEntropyFunctor(const int64_t* labels, T* loss, + const T* logits_data, int d, int axis_dim) + : labels_(labels), + loss_(loss), + logits_data_(logits_data), + d_(d), + axis_dim_(axis_dim) {} + + __device__ void operator()(int idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int remain = d_ / axis_dim_; + int idx_n = idx / d_; + int idx_axis = (idx % d_) / remain; + int idx_remain = idx % remain; + // labels, loss view as [n, remain] + int idx_lbl = idx_n * remain + idx_remain; + // It also would ignore labels not in range(class_num). 
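[Editor's note: the CrossEntropyGrad + Scale pair added above is the standard softmax-with-cross-entropy backward split into two launches. With the forward's softmax already stored in logit_grad, the chain rule gives

$$\frac{\partial \mathcal{L}}{\partial x_{i,j}} = \bigl(\mathrm{softmax}_{i,j} - \mathbf{1}[\,j = \mathrm{label}_i\,]\bigr)\,\frac{\partial \mathcal{L}}{\partial \mathrm{loss}_i},$$

so CrossEntropyGrad subtracts the one-hot indicator in place (skipping rows whose label equals ignore_index), and Scale then multiplies every element by the incoming loss gradient, zeroing the rows that were ignored. CUDA_KERNEL_LOOP_TYPE is Paddle's grid-stride loop macro, so both kernels stay correct whatever grid size the launch arithmetic picks.]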
+ if (idx_axis != labels_[idx_lbl]) { + } else { + loss_[idx_lbl] = -log_on_device(logits_data_[idx]); + } + } + + private: + const int64_t* labels_; + T* loss_; + const T* logits_data_; + int d_; + int axis_dim_; +}; + +template +struct HardLabelCrossEntropyFunctorWithIgnoreIdx { + public: + HardLabelCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, T* loss, + const T* logits_data, int d, + int axis_dim, int ignore_idx) + : labels_(labels), + loss_(loss), + logits_data_(logits_data), + d_(d), + axis_dim_(axis_dim), + ignore_idx_(ignore_idx) {} + + __device__ void operator()(int idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int remain = d_ / axis_dim_; + int idx_n = idx / d_; + int idx_axis = (idx % d_) / remain; + int idx_remain = idx % remain; + // labels, loss view as [n, remain] + int idx_lbl = idx_n * remain + idx_remain; + + if (idx_axis == labels_[idx_lbl] && idx_axis != ignore_idx_) { + loss_[idx_lbl] = -log_on_device(logits_data_[idx]); + } + } + + private: + const int64_t* labels_; + T* loss_; + const T* logits_data_; + int d_; + int axis_dim_; + int ignore_idx_; +}; + +template +static void HardLabelCrossEntropy(const platform::CUDADeviceContext& ctx, + const T* logits_data, + const int64_t* labels_data, T* loss_data, + int n, int d, int axis_dim, int ignore_idx) { + constexpr int kMaxBlockDim = 512; + int block_dim = axis_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(axis_dim))); + int grid_dim = n * d / axis_dim; + auto stream = ctx.stream(); + +#define CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + platform::ForRange for_range(ctx, n* d); \ + if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ + for_range(HardLabelCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, logits_data, d, axis_dim, ignore_idx)); \ + } else { \ + for_range(HardLabelCrossEntropyFunctor(labels_data, loss_data, \ + logits_data, d, axis_dim)); \ + } \ + } break + + switch (block_dim) { + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(512); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(256); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(128); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(64); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(32); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(16); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(8); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(4); + CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(2); + default: + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); + break; + } +#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL +} + +template +struct HardLabelSoftmaxWithCrossEntropyFunctor { + public: + HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss, + T* log_softmax, int64_t d, + int axis_dim, int ignore_idx) + : labels_(labels), + loss_(loss), + log_softmax_(log_softmax), + d_(d), + axis_dim_(axis_dim), + ignore_idx_(ignore_idx) {} + + __device__ void operator()(int64_t idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int64_t remain = d_ / axis_dim_; + int64_t idx_n = idx / d_; + int64_t idx_axis = (idx % d_) / remain; + int64_t idx_remain = idx % remain; + // labels, loss view as [n, remain] + int64_t idx_lbl = idx_n * remain + idx_remain; + PADDLE_ENFORCE(labels_[idx_lbl] >= 0 && labels_[idx_lbl] < d_ || + labels_[idx_lbl] == ignore_idx_, + "The value of label[%ld] expected >= 0 and < %ld, or == %d," + "but got %ld. 
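[Editor's note: a worked example of the block-size selection used by HardLabelCrossEntropy above; the class dimension is clamped to kMaxBlockDim and otherwise rounded down to a power of two so the switch can land on a compile-time case:

// axis_dim = 100: 1 << (int)std::log2(100) == 1 << 6 == 64
//   -> CALL_HARD_LABEL_CROSS_ENTROPY_FUSED_KERNEL(64)
// axis_dim = 600: clamped to kMaxBlockDim == 512
int block_dim = axis_dim >= kMaxBlockDim
                    ? kMaxBlockDim
                    : (1 << static_cast<int>(std::log2(axis_dim)));]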
Please check input value.", + idx_lbl, d_, ignore_idx_, labels_[idx_lbl]); + // It also would ignore labels not in range(class_num). + if (idx_axis != labels_[idx_lbl]) { + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); + } else { + auto softmax = log_softmax_[idx]; + log_softmax_[idx] = exp_on_device(softmax); + loss_[idx_lbl] = -softmax; + } + } + + private: + const int64_t* labels_; + T* loss_; + T* log_softmax_; + int64_t d_; + int axis_dim_; + int ignore_idx_; +}; + +template +struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx { + public: + HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels, + T* loss, T* log_softmax, + int64_t d, int axis_dim, + int ignore_idx) + : labels_(labels), + loss_(loss), + log_softmax_(log_softmax), + d_(d), + axis_dim_(axis_dim), + ignore_idx_(ignore_idx) {} + + __device__ void operator()(int64_t idx) const { + // logits view as [n, axis_dim, remain], where d = axis_dim * remain + int64_t remain = d_ / axis_dim_; + int64_t idx_n = idx / d_; + int64_t idx_axis = (idx % d_) / remain; + int64_t idx_remain = idx % remain; + // labels, loss view as [n, remain] + int64_t idx_lbl = idx_n * remain + idx_remain; + if (idx_axis != labels_[idx_lbl] || idx_axis == ignore_idx_) { + log_softmax_[idx] = exp_on_device(log_softmax_[idx]); + } else { + auto softmax = log_softmax_[idx]; + log_softmax_[idx] = exp_on_device(softmax); + loss_[idx_lbl] = -softmax; + } + } + + private: + const int64_t* labels_; + T* loss_; + T* log_softmax_; + int64_t d_; + int axis_dim_; + int ignore_idx_; +}; + +template +static void HardLabelSoftmaxWithCrossEntropy( + const platform::CUDADeviceContext& ctx, const T* logits_data, + const int64_t* labels_data, T* loss_data, T* softmax_data, int64_t n, + int64_t d, int axis_dim, int ignore_idx) { +#ifdef __HIPCC__ + // HIP platform will have loss nan if dim size > 256 + constexpr int kMaxBlockDim = 256; +#else + constexpr int kMaxBlockDim = 512; +#endif + int64_t block_dim = axis_dim >= kMaxBlockDim + ? 
kMaxBlockDim + : (1 << static_cast(std::log2(axis_dim))); + int64_t grid_dim = n * d / axis_dim; + auto stream = ctx.stream(); + +#ifdef __HIPCC__ +#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForMax), \ + dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ + loss_data, d, axis_dim); \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForSum), \ + dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ + loss_data, softmax_data, d, axis_dim); \ + hipLaunchKernelGGL(HIP_KERNEL_NAME(RowReductionForDiff), \ + dim3(grid_dim), dim3(BlockDim), 0, stream, logits_data, \ + loss_data, softmax_data, d, axis_dim); \ + platform::ForRange for_range(ctx, n* d); \ + if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } else { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } \ + } break +#else +#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \ + case BlockDim: { \ + RowReductionForMax<<>>( \ + logits_data, loss_data, d, axis_dim); \ + RowReductionForDiffMaxSum<<>>( \ + logits_data, loss_data, softmax_data, d, axis_dim); \ + platform::ForRange for_range(ctx, n* d); \ + if (ignore_idx >= 0 && ignore_idx < axis_dim) { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } else { \ + for_range(HardLabelSoftmaxWithCrossEntropyFunctor( \ + labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \ + } \ + } break +#endif + + switch (block_dim) { + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(512); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(256); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(128); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(64); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(32); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(16); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(8); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4); + CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2); + default: + PADDLE_THROW(platform::errors::Unavailable( + "Block Dimension must be 2^n in softmax_with_cross_entropy_op.")); + break; + } +#undef CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL +} + template static void SoftmaxWithCrossEntropyFusedKernel( const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, @@ -946,7 +783,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { const int rank = softmax->dims().size(); const int axis = CanonicalAxis(context.Attr("axis"), rank); - const int axis_dim = softmax->dims()[axis]; + int axis_dim = softmax->dims()[axis]; const int n = SizeToAxis(axis, softmax->dims()); const int d = SizeFromAxis(axis, softmax->dims()); @@ -989,19 +826,9 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { // HardLabel auto* logits_data = softmax->data(); auto* labels_data = labels->data(); - int threads = 128; - int blocks = (n * d / axis_dim + threads - 1) / threads; - if (ignore_index >= 0 && ignore_index < axis_dim) { - CrossEntropyHardLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, logits_data, labels_data, n, axis_dim, d / 
axis_dim, - ignore_index); - } else { - CrossEntropyHardLabel<<< - blocks, threads, 0, context.cuda_device_context().stream()>>>( - loss_data, logits_data, labels_data, n, axis_dim, d / axis_dim, - ignore_index); - } + HardLabelCrossEntropy(context.cuda_device_context(), logits_data, + labels_data, loss_data, n, d, axis_dim, + ignore_index); } // cause of input is softmax @@ -1059,17 +886,9 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { } else { auto* logits_data = logits->data(); auto* labels_data = labels->data(); - if (ignore_index >= 0 && ignore_index < axis_dim) { - SoftmaxWithCrossEntropyHardLabel( - context.cuda_device_context(), rank, axis, logits_data, - labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, - ignore_index); - } else { - SoftmaxWithCrossEntropyHardLabel( - context.cuda_device_context(), rank, axis, logits_data, - labels_data, loss_data, softmax_data, n, axis_dim, d / axis_dim, - ignore_index); - } + HardLabelSoftmaxWithCrossEntropy( + context.cuda_device_context(), logits_data, labels_data, loss_data, + softmax_data, n, d, axis_dim, ignore_index); } } } @@ -1140,11 +959,14 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { SoftCrossEntropyGradientKernel<<>>( logit_grad_data, loss_grad_data, label_data, n, d, remain); } else { + int64_t grid = (n * remain + block - 1) / block; const int64_t* label_data = labels->data(); - int grid = (n * d + block - 1) / block; - SoftmaxWithCrossEntropyGradHardLabel<<>>( - logit_grad_data, loss_grad_data, label_data, n, d / remain, remain, - ignore_index); + CrossEntropyGrad<<>>( + logit_grad_data, label_data, n, d, remain, ignore_index); + int64_t num = n * d; + grid = (num + block - 1) / block; + Scale<<>>(logit_grad_data, loss_grad_data, num, + d, remain, label_data, ignore_index); } } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index a34946315f5a81d04956735ce5b89b72761a6d0f..9921248d1ca1d652cd7505a50b7a2ec4c46afc9e 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -47,7 +47,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { // softmax softmax->mutable_data(ctx.GetPlace()); - auto runner_softmax = + const auto& runner_softmax = NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); runner_softmax.Run(stream); @@ -57,7 +57,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { tmp_labels.Resize(labels->dims()); tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - auto runner_cast_label = + const auto& runner_cast_label = NpuOpRunner("Cast", {*labels}, {tmp_labels}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_label.Run(stream); @@ -77,7 +77,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { tmp_onehot.Resize(logits->dims()); tmp_onehot.mutable_data(ctx.GetPlace()); - auto runner_onehot = + const auto& runner_onehot = NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, {{"axis", -1}, {"depth", cls_num}}); runner_onehot.Run(stream); @@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { cast_onehot.Resize(tmp_onehot.dims()); cast_onehot.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(logits->type()); - auto runner_cast_onehot = + const 
auto& runner_cast_onehot = NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_onehot.Run(stream); @@ -102,8 +102,9 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { // SoftmaxCrossEntropyWithLogits requires loss to be of shape [batch_size] auto loss_dims = loss->dims(); loss->Resize({loss_dims[0]}); - auto runner_s = NpuOpRunner("SoftmaxCrossEntropyWithLogits", - {*logits, cast_onehot}, {*loss, backprop}, {}); + const auto& runner_s = + NpuOpRunner("SoftmaxCrossEntropyWithLogits", {*logits, cast_onehot}, + {*loss, backprop}, {}); runner_s.Run(stream); loss->Resize(loss_dims); } @@ -130,7 +131,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { tmp_labels.Resize(labels->dims()); tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - auto runner_cast_label = + const auto& runner_cast_label = NpuOpRunner("Cast", {*labels}, {tmp_labels}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_label.Run(stream); @@ -150,7 +151,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { tmp_onehot.Resize(softmax->dims()); tmp_onehot.mutable_data(ctx.GetPlace()); - auto runner_onehot = + const auto& runner_onehot = NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, {{"axis", -1}, {"depth", cls_num}}); runner_onehot.Run(stream); @@ -160,7 +161,7 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { cast_onehot.Resize(tmp_onehot.dims()); cast_onehot.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(softmax->type()); - auto runner_cast_onehot = + const auto& runner_cast_onehot = NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_onehot.Run(stream); @@ -169,13 +170,13 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { Tensor tmp_sub(softmax->type()); tmp_sub.Resize(softmax->dims()); tmp_sub.mutable_data(ctx.GetPlace()); - auto runner_sub = + const auto& runner_sub = NpuOpRunner("Sub", {*softmax, cast_onehot}, {tmp_sub}, {}); runner_sub.Run(stream); // mul logits_grad->mutable_data(ctx.GetPlace()); - auto runner_mul = + const auto& runner_mul = NpuOpRunner("Mul", {*loss_grad, tmp_sub}, {*logits_grad}, {}); runner_mul.Run(stream); } diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 958655b1f27c680655c20e8f795fc9e4bf37251d..a7e18e9c0c31b1a9c2254fe55c7e24adefde4bf4 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -69,7 +69,7 @@ class StackNPUKernel : public framework::OpKernel { tmp_stack.Resize(framework::make_ddim(vec_dim_tmp)); tmp_stack.mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}}); runner.Run(stream); @@ -81,12 +81,12 @@ class StackNPUKernel : public framework::OpKernel { } } - auto runner_trans_final = + const auto& runner_trans_final = NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}}); runner_trans_final.Run(stream); } else { - auto runner = + const auto& runner = NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}}); runner.Run(stream); } diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index e49476e4dc7d4a0eb5d4bb996e935b30dafd55d0..d71be60e1f5c22dc0a43d1d035044c1b96c86c41 100644 --- 
a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -329,9 +329,9 @@ REGISTER_OP_CPU_KERNEL( ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, + paddle::platform::complex>, ops::StridedSliceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( strided_slice_grad, @@ -340,6 +340,6 @@ REGISTER_OP_CPU_KERNEL( ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, + paddle::platform::complex>, ops::StridedSliceGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu index b85403b1c5bb886a1a08f084e899c7f27ab5e963..68a8312f0818d418a820a742a9c4b832b5a8eb5b 100644 --- a/paddle/fluid/operators/strided_slice_op.cu +++ b/paddle/fluid/operators/strided_slice_op.cu @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/strided_slice_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( @@ -24,9 +23,9 @@ REGISTER_OP_CUDA_KERNEL( ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, + paddle::platform::complex>, ops::StridedSliceKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( strided_slice_grad, @@ -35,6 +34,6 @@ REGISTER_OP_CUDA_KERNEL( ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, + paddle::platform::complex>, ops::StridedSliceGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index e3dc5faf46c81e71173c6f5a6ad7766067cad1c3..a1550bde696626a2fca57a4e873c624faa3185f7 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -43,12 +43,12 @@ class SumNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {}); + const auto& runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {}); runner.Run(stream); for (int i = 2; i < n; i++) { - runner = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {}); - runner.Run(stream); + const auto& runner1 = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {}); + runner1.Run(stream); } } }; diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index a7d7ea260ecdf44ab94e65f28db1294f7c57c527..07749f90ebaa29c3f618a5850ad2d72942035e95 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #ifdef __HIPCC__ #include #endif +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" @@ -563,15 +564,19 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, const Eigen::DSizes slice_sizes{num_rows, k}; auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From(temp_indices); + auto e_tmp_indices = framework::EigenMatrix::From( + static_cast(temp_indices)); std::vector odims = {static_cast(num_rows), static_cast(k)}; auto dim = framework::make_ddim(odims); auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From(temp_values); + auto e_tmp_values = + framework::EigenMatrix::From(static_cast(temp_values)); - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); } return true; } diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index 9785e73a4044ebb345a442dd71ae04b42e55cad7..ca3a5f957685d98bfdc3a008ab71d5806814b1eb 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -67,8 +67,8 @@ class TopkNPUKernel : public framework::OpKernel { tmp_indices.mutable_data(ctx.GetPlace()); // run ascend - auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, - {*output, tmp_indices}, attr_input); + const auto& runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, + {*output, tmp_indices}, attr_input); auto stream = ctx.template device_context() .stream(); @@ -76,7 +76,7 @@ class TopkNPUKernel : public framework::OpKernel { // cast indices from INT32 to INT64 auto dst_dtype = ConvertToNpuDtype(indices->type()); - auto runner_cast_indices = + const auto& runner_cast_indices = NpuOpRunner("Cast", {tmp_indices}, {*indices}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_indices.Run(stream); diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 623d4c7fc23ba2477d720c46697760efb1dd1429..de71a089b692a9f2ea4c3c59c1fa85cbc47b1e33 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -167,18 +167,18 @@ REGISTER_OP_CPU_KERNEL( ops::TraceKernel, ops::TraceKernel, ops::TraceKernel, + paddle::platform::complex>, ops::TraceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( trace_grad, ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, + paddle::platform::complex>, ops::TraceGradKernel); + paddle::platform::complex>); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(trace) diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index 2c2745018be40255cd35585b06303506cf4dd386..6798521c8f7470c4a941c2ed5c41f4376305b706 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -64,9 +64,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TraceCUDAKernel, ops::TraceCUDAKernel, ops::TraceCUDAKernel, + paddle::platform::complex>, ops::TraceCUDAKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( trace_grad, ops::TraceGradKernel, 
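[Editor's note on the top_k_function_cuda.h hunk above: extraction dropped the template arguments; with them restored (hedged reconstruction based on the surrounding code), the changed lines read:

auto e_tmp_indices = framework::EigenMatrix<int64_t>::From(
    static_cast<const framework::Tensor&>(temp_indices));
auto e_tmp_values = framework::EigenMatrix<T>::From(
    static_cast<const framework::Tensor&>(temp_values));

The casts to const Tensor& appear to exist because the new EigenSlice wrapper's input type is a const-qualified TensorMap, which EigenMatrix::From yields only for a const tensor argument.]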
ops::TraceGradKernel, @@ -75,6 +75,6 @@ REGISTER_OP_CUDA_KERNEL( ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, + paddle::platform::complex>, ops::TraceGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 465970451f5d105e6a33555ed241c4528e35d50a..95b2c13ff6c631c05ab3abd2cf582ad3603dc031 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -341,17 +341,17 @@ REGISTER_OP_CPU_KERNEL( transpose, ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel, + paddle::platform::complex>, ops::TransposeKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( transpose_grad, ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, + paddle::platform::complex>, ops::TransposeGradKernel); + paddle::platform::complex>); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, @@ -366,9 +366,9 @@ REGISTER_OP_CPU_KERNEL( ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel, + paddle::platform::complex>, ops::TransposeKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( transpose2_grad, ops::TransposeGradKernel, @@ -376,6 +376,6 @@ REGISTER_OP_CPU_KERNEL( ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, + paddle::platform::complex>, ops::TransposeGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu index afeb22bd6fa2d4e1c4d222b01d65bff8bf05a74b..a462bbb4834acc502e57e189afb23137b09b73a0 100644 --- a/paddle/fluid/operators/transpose_op.cu +++ b/paddle/fluid/operators/transpose_op.cu @@ -732,9 +732,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, + paddle::platform::complex>, ops::TransposeGPUKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose_grad, ops::TransposeGradGPUKernel, @@ -742,9 +742,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, + paddle::platform::complex>, ops::TransposeGradGPUKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose2, @@ -754,9 +754,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, + paddle::platform::complex>, ops::TransposeGPUKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose2_grad, ops::TransposeGradGPUKernel, @@ -766,6 +766,6 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, + paddle::platform::complex>, ops::TransposeGradGPUKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc index 994b8e534f85e2926481d3767f6e75892751d959..035ad5f3f314aaa00f6f717e564c1933f3b7c562 100644 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -29,7 +29,7 @@ class TransposeNPUKernel : public framework::OpKernel { std::vector axis = ctx.Attr>("axis"); framework::NPUAttributeMap attr_input = {{"perm", axis}}; out->mutable_data(ctx.device_context().GetPlace()); - auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); auto stream = ctx.template device_context() .stream(); @@ -52,7 +52,8 @@ class TransposeGradNPUKernel : public framework::OpKernel { } 
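[Editor's note: the strided_slice / trace / transpose registrations above migrate from the removed complex64 / complex128 types to the single templated paddle::platform::complex<T> in platform/complex.h (the standalone complex128.h is deleted just below). Each garbled `paddle::platform::complex>` pair lost its `<float>` / `<double>` argument to extraction; restored, a representative registration reads:

REGISTER_OP_CPU_KERNEL(
    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TransposeKernel<paddle::platform::CPUDeviceContext, double>,
    ops::TransposeKernel<paddle::platform::CPUDeviceContext,
                         paddle::platform::complex<float>>,
    ops::TransposeKernel<paddle::platform::CPUDeviceContext,
                         paddle::platform::complex<double>>);]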
x_grad->mutable_data(ctx.GetPlace()); framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; - auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + const auto& runner = + NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 7f3190d9112c66a09b1a5c7432a06b6e4a4ead6f..1cc46e7265f63992092ab260e8cbf3f756e05db6 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -59,7 +59,7 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = NpuOpRunner( + const auto& runner = NpuOpRunner( "ParameterizedTruncatedNormal", {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, {*out}, {{"seed", seed_var}}); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 12a54fd7e87f447530888b85f15a66d1e16fdc9c..36a956762174e18ed7eef1d6e1158b82bf3ceeae 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,7 +1,7 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(error_codes_proto SRCS error_codes.proto) if(WITH_GPU) - proto_library(cuda_error_proto SRCS cuda_error.proto) + proto_library(external_error_proto SRCS external_error.proto) endif(WITH_GPU) if(WITH_XPU) @@ -45,7 +45,7 @@ cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) set(enforce_deps flags errors boost) if(WITH_GPU) - set(enforce_deps ${enforce_deps} cuda_error_proto) + set(enforce_deps ${enforce_deps} external_error_proto) endif() cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) cc_library(monitor SRCS monitor.cc) diff --git a/paddle/fluid/platform/complex128.h b/paddle/fluid/platform/complex128.h deleted file mode 100644 index da2f83c3497cce7b162336360690e1e76bce8b19..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/complex128.h +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX128 -#endif - -namespace paddle { -namespace platform { - -struct PADDLE_ALIGN(16) complex128 { - public: - double real; - double imag; - - complex128() = default; - complex128(const complex128& o) = default; - complex128& operator=(const complex128& o) = default; - complex128(complex128&& o) = default; - complex128& operator=(complex128&& o) = default; - ~complex128() = default; - - HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - HOSTDEVICE inline explicit complex128(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipDoubleComplex() const { - return make_hipDoubleComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuDoubleComplex() const { - return make_cuDoubleComplex(real, imag); - } -#endif -#endif - - HOSTDEVICE complex128(const float& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex128(const double& val) : real(val), imag(0) {} - HOSTDEVICE complex128(const int& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex128(const int64_t& val) - : real(static_cast(val)), imag(0) {} - - HOSTDEVICE inline explicit operator std::complex() { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE inline explicit complex128(const T& val) - : real(complex128(static_cast(val)).real) {} - - HOSTDEVICE complex128(const std::complex val) - : real(val.real()), imag(val.imag()) {} - - HOSTDEVICE inline complex128& operator=(bool b) { - real = b ? 
1 : 0; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(float val) { - real = val; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(double val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline operator float() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } -}; - -HOSTDEVICE inline complex128 operator+(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) + - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real + b.real, a.imag + b.imag); -#endif -} - -HOSTDEVICE inline complex128 operator-(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) - - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real - b.real, a.imag - b.imag); -#endif -} - -HOSTDEVICE inline complex128 operator*(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) * - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -HOSTDEVICE inline complex128 operator/(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || 
defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) / - thrust::complex(b.real, b.imag)); -#else - double denominator = b.real * b.real + b.imag * b.imag; - return complex128((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -HOSTDEVICE inline complex128 operator-(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(-thrust::complex(a.real, a.imag)); -#else - complex128 res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; -#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - double denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - -HOSTDEVICE inline complex128 raw_uint16_to_complex128(uint16_t a) { - complex128 res; - res.real = a; - return res; -} - -HOSTDEVICE inline bool operator==(const complex128& a, const complex128& b) { - return a.real == b.real && a.imag == b.imag; -} - -HOSTDEVICE inline bool operator!=(const complex128& a, const complex128& b) { - return a.real != b.real || a.imag != b.imag; -} - -HOSTDEVICE inline bool operator<(const complex128& a, const complex128& b) { - return static_cast(a.real) < static_cast(b.real); -} - -HOSTDEVICE inline bool operator<=(const complex128& a, const complex128& b) { - return static_cast(a.real) <= static_cast(b.real); -} - -HOSTDEVICE inline bool operator>(const complex128& a, const complex128& b) { - return static_cast(a.real) > static_cast(b.real); -} - -HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) { - return static_cast(a.real) >= static_cast(b.real); -} - -HOSTDEVICE inline bool(isnan)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isnanf not supported on HIP platform - return __isnan(a.real) || __isnan(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -HOSTDEVICE inline bool(isinf)(const complex128& a) { -#if 
defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isinf not supported on HIP platform - return __isinf(a.real) || __isinf(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -HOSTDEVICE inline bool(isfinite)(const complex128& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline double(abs)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::abs(thrust::complex(a.real, a.imag)); -#else - return std::abs(std::complex(a.real, a.imag)); -#endif -} - -HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::pow(thrust::complex(a.real, a.imag), - thrust::complex(b.real, b.imag))); -#else - return std::pow(std::complex(a), std::complex(b)); -#endif -} - -HOSTDEVICE inline complex128(sqrt)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::sqrt(thrust::complex(a.real, a.imag))); -#else - return std::sqrt(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex128(tanh)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::tanh(thrust::complex(a.real, a.imag))); -#else - return std::tanh(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex128(log)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::log(thrust::complex(a.real, a.imag))); -#else - return complex128(std::log(std::complex(a))); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const complex128& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} - -} // namespace platform -} // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> { -}; -template <> -struct is_signed { - static const bool value = false; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::complex128& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::complex128& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static const bool traps = 
false; - static const bool tinyness_before = false; - - static paddle::platform::complex128(min)() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 lowest() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128(max)() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 epsilon() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 round_error() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 infinity() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 quiet_NaN() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 signaling_NaN() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 denorm_min() { - return paddle::platform::complex128(0.0, 0.0); - } -}; - -} // namespace std - -#define MKL_Complex16 paddle::platform::complex128 diff --git a/paddle/fluid/platform/complex64.h b/paddle/fluid/platform/complex64.h deleted file mode 100644 index 0aad7bd9dd2a8f1d59833720b442e34afa176ca6..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/complex64.h +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
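Both deleted headers (complex128.h above, complex64.h next) are superseded by the single templated paddle/fluid/platform/complex.h that the cuda_primitives.h and eigen_ext.h hunks below include instead. A minimal sketch of the unified type, assuming it mirrors the fields of the deleted structs; the real header carries the full operator and std:: surface seen in these deletions:

    // Sketch only -- not the verbatim contents of complex.h.
    namespace paddle {
    namespace platform {
    template <typename T>
    struct PADDLE_ALIGN(sizeof(T) * 2) complex {
      T real;
      T imag;
      complex() = default;
      HOSTDEVICE complex(T r, T i) : real(r), imag(i) {}
    };
    }  // namespace platform
    }  // namespace paddle
    // With one template, the MKL interop macros can alias the same type:
    //   #define MKL_Complex8  paddle::platform::complex<float>
    //   #define MKL_Complex16 paddle::platform::complex<double>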
- -#pragma once - -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX64 -#endif - -#include "complex128.h" // NOLINT - -namespace paddle { -namespace platform { - -struct PADDLE_ALIGN(8) complex64 { - public: - float real; - float imag; - - complex64() = default; - complex64(const complex64& o) = default; - complex64& operator=(const complex64& o) = default; - complex64(complex64&& o) = default; - complex64& operator=(complex64&& o) = default; - ~complex64() = default; - - HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - HOSTDEVICE inline explicit complex64(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipFloatComplex() const { - return make_hipFloatComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuFloatComplex() const { - return make_cuFloatComplex(real, imag); - } -#endif -#endif - - HOSTDEVICE complex64(const float& val) : real(val), imag(0) {} - HOSTDEVICE complex64(const double& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex64(const int& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex64(const int64_t& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex64(const complex128& val) - : real(static_cast(val.real)), - imag(static_cast(val.imag)) {} - - HOSTDEVICE inline explicit operator std::complex() { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE inline explicit complex64(const T& val) - : real(complex64(static_cast(val)).real) {} - - HOSTDEVICE complex64(const std::complex val) - : real(val.real()), imag(val.imag()) {} - - HOSTDEVICE inline complex64& operator=(bool b) { - real = b ? 
1 : 0; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(float val) { - real = val; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(double val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline operator float() const { return this->real; } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } - - HOSTDEVICE inline operator complex128() const { - return complex128(static_cast(this->real), - static_cast(this->imag)); - } -}; - -HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) + - thrust::complex(b.real, b.imag)); -#else - return complex64(a.real + b.real, a.imag + b.imag); -#endif -} - -HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) - - thrust::complex(b.real, b.imag)); -#else - return complex64(a.real - b.real, a.imag - b.imag); -#endif -} - -HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) * - thrust::complex(b.real, b.imag)); -#else - return complex64(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) { -#if 
defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) / - thrust::complex(b.real, b.imag)); -#else - float denominator = b.real * b.real + b.imag * b.imag; - return complex64((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -HOSTDEVICE inline complex64 operator-(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(-thrust::complex(a.real, a.imag)); -#else - complex64 res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; -#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -HOSTDEVICE inline complex64& operator/=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - float denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - -HOSTDEVICE inline complex64 raw_uint16_to_complex64(uint16_t a) { - complex64 res; - res.real = a; - return res; -} - -HOSTDEVICE inline bool operator==(const complex64& a, const complex64& b) { - return a.real == b.real && a.imag == b.imag; -} - -HOSTDEVICE inline bool operator!=(const complex64& a, const complex64& b) { - return a.real != b.real || a.imag != b.imag; -} - -HOSTDEVICE inline bool operator<(const complex64& a, const complex64& b) { - return static_cast(a.real) < static_cast(b.real); -} - -HOSTDEVICE inline bool operator<=(const complex64& a, const complex64& b) { - return static_cast(a.real) <= static_cast(b.real); -} - -HOSTDEVICE inline bool operator>(const complex64& a, const complex64& b) { - return static_cast(a.real) > static_cast(b.real); -} - -HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) { - return static_cast(a.real) >= static_cast(b.real); -} - -HOSTDEVICE inline bool(isnan)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isnanf not supported on HIP platform - return __isnanf(a.real) || __isnanf(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -HOSTDEVICE inline 
bool(isinf)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isinff not supported on HIP platform - return __isinff(a.real) || __isinff(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -HOSTDEVICE inline bool(isfinite)(const complex64& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline float(abs)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::abs(thrust::complex(a.real, a.imag))); -#else - return std::abs(std::complex(a.real, a.imag)); -#endif -} - -HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::pow(thrust::complex(a.real, a.imag), - thrust::complex(b.real, b.imag))); -#else - return std::pow(std::complex(a), std::complex(b)); -#endif -} - -HOSTDEVICE inline complex64(sqrt)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::sqrt(thrust::complex(a.real, a.imag))); -#else - return std::sqrt(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex64(tanh)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::tanh(thrust::complex(a.real, a.imag))); -#else - return std::tanh(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex64(log)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::log(thrust::complex(a.real, a.imag))); -#else - return std::log(std::complex(a)); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const complex64& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} - -} // namespace platform -} // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = false; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::complex64& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::complex64& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static 
const bool traps = false;
-  static const bool tinyness_before = false;
-
-  static paddle::platform::complex64(min)() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 lowest() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64(max)() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 epsilon() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 round_error() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 infinity() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 quiet_NaN() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 signaling_NaN() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-  static paddle::platform::complex64 denorm_min() {
-    return paddle::platform::complex64(0.0, 0.0);
-  }
-};
-
-}  // namespace std
-
-#define MKL_Complex8 paddle::platform::complex64
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index 4095720f71eb7185c474934231220b917a770375..352143302388a9f8169a40a14ccea9bae647cfc6 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -31,6 +31,7 @@ namespace platform {
 #endif
 
 inline static int RoundToPowerOfTwo(int dim) {
+#ifdef PADDLE_WITH_CUDA
   if (dim > 512) {
     return 1024;
   } else if (dim > 256) {
@@ -44,6 +45,17 @@ inline static int RoundToPowerOfTwo(int dim) {
   } else {
     return 32;
   }
+#else  // HIP results in error or nan if > 256
+  if (dim > 128) {
+    return 256;
+  } else if (dim > 64) {
+    return 128;
+  } else if (dim > 32) {
+    return 64;
+  } else {
+    return 32;
+  }
+#endif
 }
 
 #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index 94f64d158afbcbc702e5c1a47cefb61a9118067b..4708a99e8fc4ca9682500602da95a710d34e268e 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -20,8 +20,7 @@ limitations under the License.
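The RoundToPowerOfTwo change above keeps the CUDA ceiling at 1024 but caps HIP at 256, since larger launch widths reportedly produce errors or NaNs on ROCm. Values implied by the two branches (not part of the patch, shown for illustration):

    //   RoundToPowerOfTwo(100) == 128   on CUDA and on HIP
    //   RoundToPowerOfTwo(300) == 512   on CUDA, 256 on HIP (capped)
    //   RoundToPowerOfTwo(600) == 1024  on CUDA, 256 on HIP (capped)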
 */
 #include <cuda.h>
 #endif
 #include <stdio.h>
-#include "paddle/fluid/platform/complex128.h"
-#include "paddle/fluid/platform/complex64.h"
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
 namespace paddle {
@@ -135,18 +134,18 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
 }
 #endif
 
-CUDA_ATOMIC_WRAPPER(Add, complex64) {
+CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
   float *real = reinterpret_cast<float *>(address);
   float *imag = real + 1;
-  return complex64(CudaAtomicAdd(real, val.real),
-                   CudaAtomicAdd(imag, val.imag));
+  return complex<float>(CudaAtomicAdd(real, val.real),
+                        CudaAtomicAdd(imag, val.imag));
 }
 
-CUDA_ATOMIC_WRAPPER(Add, complex128) {
+CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
   double *real = reinterpret_cast<double *>(address);
   double *imag = real + 1;
-  return complex128(CudaAtomicAdd(real, val.real),
-                    CudaAtomicAdd(imag, val.imag));
+  return complex<double>(CudaAtomicAdd(real, val.real),
+                         CudaAtomicAdd(imag, val.imag));
 }
 
 // For atomicMax
diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h
index 05a431e731e32c2b36f0aebfa11cb95f2607929c..8e969588afbbcf5d49f71f5165668cb7fb946e6c 100644
--- a/paddle/fluid/platform/cudnn_desc.h
+++ b/paddle/fluid/platform/cudnn_desc.h
@@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType(
     case framework::proto::VarType::FP64:
       type = CUDNN_DATA_DOUBLE;
       break;
+#if CUDNN_VERSION_MIN(8, 1, 0)
+    case framework::proto::VarType::BF16:
+      type = CUDNN_DATA_BFLOAT16;
+      break;
+#endif
     default:
       break;
   }
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 6c3c96b68c48a1314f4a90a97a2542ea3060446a..65dd69a37d37f8116deee0e63ab89d9249f908ba 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -34,35 +34,6 @@ DECLARE_bool(cudnn_deterministic);
 
 namespace paddle {
 namespace platform {
 
-inline const char* cudnnGetErrorString(cudnnStatus_t status) {
-  switch (status) {
-    case CUDNN_STATUS_SUCCESS:
-      return "CUDNN_STATUS_SUCCESS";
-    case CUDNN_STATUS_NOT_INITIALIZED:
-      return "CUDNN_STATUS_NOT_INITIALIZED";
-    case CUDNN_STATUS_ALLOC_FAILED:
-      return "CUDNN_STATUS_ALLOC_FAILED";
-    case CUDNN_STATUS_BAD_PARAM:
-      return "CUDNN_STATUS_BAD_PARAM";
-    case CUDNN_STATUS_INTERNAL_ERROR:
-      return "CUDNN_STATUS_INTERNAL_ERROR";
-    case CUDNN_STATUS_INVALID_VALUE:
-      return "CUDNN_STATUS_INVALID_VALUE";
-    case CUDNN_STATUS_ARCH_MISMATCH:
-      return "CUDNN_STATUS_ARCH_MISMATCH";
-    case CUDNN_STATUS_MAPPING_ERROR:
-      return "CUDNN_STATUS_MAPPING_ERROR";
-    case CUDNN_STATUS_EXECUTION_FAILED:
-      return "CUDNN_STATUS_EXECUTION_FAILED";
-    case CUDNN_STATUS_NOT_SUPPORTED:
-      return "CUDNN_STATUS_NOT_SUPPORTED";
-    case CUDNN_STATUS_LICENSE_ERROR:
-      return "CUDNN_STATUS_LICENSE_ERROR";
-    default:
-      return "Unknown cudnn error number";
-  }
-}
-
 #define CUDNN_VERSION_MIN(major, minor, patch) \
   (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
 
@@ -131,6 +102,25 @@ inline ActivationMode StringToActivationMode(const std::string& str) {
 template <typename T>
 class CudnnDataType;
 
+// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1
+#if CUDNN_VERSION_MIN(8, 1, 0)
+template <>
+class CudnnDataType<bfloat16> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16;
+  using ScalingParamType = const float;
+  using BatchNormParamType = float;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+#endif
+
 template <>
 class CudnnDataType<float16> {
  public:
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index e62f0673e97fadc68de6c7f08591a941e035a4b8..8d9d1fd96f463c8e05e9c7e6ba7ed42672459bec 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -358,15 +358,16 @@ class CUDAContext {
     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion(
         &miopen_major, &miopen_minor, &miopen_patch));
     auto local_miopen_version =
-        (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100;
-    auto compile_miopen_version = MIOPEN_VERSION / 100;
+        (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
+    auto compile_miopen_version = MIOPEN_VERSION / 10;
     if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
       LOG_FIRST_N(WARNING, 1)
           << "WARNING: device: " << place_.device
           << ". The installed Paddle is compiled with MIOPEN "
-          << compile_miopen_version / 10 << "." << compile_miopen_version % 10
+          << compile_miopen_version / 100 << "."
+          << compile_miopen_version % 100
          << ", but MIOPEN version in your machine is "
-          << local_miopen_version / 10 << "." << local_miopen_version % 10
+          << local_miopen_version / 100 << "." << local_miopen_version % 100
          << ", which may cause serious incompatible bug. "
          << "Please recompile or reinstall Paddle with compatible MIOPEN "
             "version.";
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 724a9b8483cdee5d98cd2988aea7e57c9bfc8ff5..1bd46c0bfafaab92a2217751ee80ce1872af4474 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -511,7 +511,7 @@ class DeviceTracerImpl : public DeviceTracer {
       auto c = correlations_.find(r.correlation_id);
       if (c != correlations_.end() && c->second != nullptr) {
         event->set_name(c->second->name());
-        event->set_detail_info(r.name);
+        event->set_detail_info(c->second->attr());
         find++;
       } else {
         VLOG(10) << "Missing Kernel Event: " + r.name;
diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h
index 77ff3f3ccbbb6ee395beae7f4f8cb270b714a961..f72eb6731f6276c049b2fe397cda660fd61c1def 100644
--- a/paddle/fluid/platform/dynload/miopen.h
+++ b/paddle/fluid/platform/dynload/miopen.h
@@ -21,8 +21,8 @@ limitations under the License.
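The device_context.h hunk above and the miopen.h hunk being opened here move MIOpen's minor version from the hundreds digits to the tens digit, which keeps two digits available for the printed minor number (MIOpen ships minors above 9, e.g. 2.11). A worked check of the arithmetic, not part of the patch:

    // MIOpen 2.9.0 under the new encoding:
    //   MIOPEN_VERSION         = 2 * 1000 + 9 * 10 + 0 = 2090
    //   compile_miopen_version = 2090 / 10 = 209
    //   printed: 209 / 100 = 2, 209 % 100 = 9  ->  "2.9"
    // MIOpen 2.11.0: 2000 + 110 + 0 = 2110 -> 211 -> "2.11",
    // whereas the old *100 encoding (3100 -> 31) would print "3.1".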
*/ #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" -#define MIOPEN_VERSION \ - (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 100 + \ +#define MIOPEN_VERSION \ + (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ MIOPEN_VERSION_PATCH) // NOLINT namespace paddle { diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 4eea87e909d1b45e1f5323f774138529961961c8..09b8c8137fcd1f8e222562295cc28a2f2b1b7a67 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -16,8 +16,6 @@ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" @@ -25,8 +23,6 @@ namespace Eigen { -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; using float16 = paddle::platform::float16; template using complex = paddle::platform::complex; @@ -64,50 +60,6 @@ struct NumTraits } }; -template <> -struct NumTraits : GenericNumTraits> { - typedef float Real; - typedef typename NumTraits::Literal Literal; - enum { - IsComplex = 1, - RequireInitialization = NumTraits::RequireInitialization, - ReadCost = 2 * NumTraits::ReadCost, - AddCost = 2 * NumTraits::AddCost, - MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost - }; - - EIGEN_DEVICE_FUNC - static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC - static inline Real dummy_precision() { - return NumTraits::dummy_precision(); - } - EIGEN_DEVICE_FUNC - static inline int digits10() { return NumTraits::digits10(); } -}; - -template <> -struct NumTraits : GenericNumTraits> { - typedef double Real; - typedef typename NumTraits::Literal Literal; - enum { - IsComplex = 1, - RequireInitialization = NumTraits::RequireInitialization, - ReadCost = 2 * NumTraits::ReadCost, - AddCost = 2 * NumTraits::AddCost, - MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost - }; - - EIGEN_DEVICE_FUNC - static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC - static inline Real dummy_precision() { - return NumTraits::dummy_precision(); - } - EIGEN_DEVICE_FUNC - static inline int digits10() { return NumTraits::digits10(); } -}; - template <> struct NumTraits> : GenericNumTraits> { typedef float Real; @@ -271,136 +223,6 @@ HOSTDEVICE inline paddle::platform::bfloat16 maxi( return a < b ? 
b : a; } -//////////// complex64 methods ///////////// - -template <> -HOSTDEVICE inline bool(isnan)(const complex64& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const complex64& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const complex64& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline complex64 exp(const complex64& a) { - float com = ::expf(a.real); - float res_real = com * ::cosf(a.imag); - float res_imag = com * ::sinf(a.imag); - return complex64(res_real, res_imag); -} - -template <> -HOSTDEVICE inline complex64 log(const complex64& a) { - return paddle::platform::log(a); -} - -template <> -HOSTDEVICE inline complex64 tanh(const complex64& a) { - return paddle::platform::tanh(a); -} - -template <> -HOSTDEVICE inline complex64 sqrt(const complex64& a) { - return paddle::platform::sqrt(a); -} - -template <> -HOSTDEVICE inline complex64 ceil(const complex64& a) { - return complex64(::ceilf(a.real), ::ceilf(a.imag)); -} - -template <> -HOSTDEVICE inline complex64 floor(const complex64& a) { - return complex64(::floorf(a.real), ::floor(a.imag)); -} - -template <> -HOSTDEVICE inline complex64 round(const complex64& a) { - return complex64(::roundf(a.real), ::roundf(a.imag)); -} - -template <> -HOSTDEVICE inline complex64 pow(const complex64& a, const complex64& b) { - return paddle::platform::pow(a, b); -} - -template <> -HOSTDEVICE inline float abs(const complex64& a) { - return paddle::platform::abs(a); -} - -//////////// complex128 methods ///////////// - -template <> -HOSTDEVICE inline bool(isnan)(const complex128& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const complex128& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const complex128& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline complex128 exp(const complex128& a) { - double com = ::expf(a.real); - double res_real = com * ::cosf(a.imag); - double res_imag = com * ::sinf(a.imag); - return complex128(res_real, res_imag); -} - -template <> -HOSTDEVICE inline complex128 log(const complex128& a) { - return paddle::platform::log(a); -} - -template <> -HOSTDEVICE inline complex128 tanh(const complex128& a) { - return paddle::platform::tanh(a); -} - -template <> -HOSTDEVICE inline complex128 sqrt(const complex128& a) { - return paddle::platform::sqrt(a); -} - -template <> -HOSTDEVICE inline complex128 ceil(const complex128& a) { - return complex128(::ceilf(a.real), ::ceilf(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 floor(const complex128& a) { - return complex128(::floorf(a.real), ::floor(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 round(const complex128& a) { - return complex128(::roundf(a.real), ::roundf(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 pow(const complex128& a, const complex128& b) { - return paddle::platform::pow(a, b); -} - -template <> -HOSTDEVICE inline double abs(const complex128& a) { - return paddle::platform::abs(a); -} - //////////// complex methods ///////////// template <> diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index d42733823e669b03daa8f29dfa0c40be38de1069..d3890de89a5d140bfa09d04909f703f4ca771a05 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -34,7 +34,7 @@ limitations under the License. 
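A detail worth flagging in the Eigen specializations removed above: the complex128 exp called the single-precision ::expf/::cosf/::sinf on double components, so the deletion also retires a silent precision loss. A host-side sketch of a type-correct templated replacement (complex_exp is an illustrative name, not the actual complex.h API; requires <cmath>):

    template <typename T>
    paddle::platform::complex<T> complex_exp(
        const paddle::platform::complex<T>& a) {
      // std::exp/cos/sin overload on T, so double stays double.
      T m = std::exp(a.real);
      return paddle::platform::complex<T>(m * std::cos(a.imag),
                                          m * std::sin(a.imag));
    }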
*/ #include #include #include -#include "paddle/fluid/platform/cuda_error.pb.h" +#include "paddle/fluid/platform/external_error.pb.h" #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP @@ -682,41 +682,83 @@ struct EOFException : public std::exception { END_HANDLE_THE_ERROR \ } while (0) -/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ +/**************************************************************************/ +/**************************** NVIDIA ERROR ********************************/ #ifdef PADDLE_WITH_CUDA -/***** CUDA ERROR *****/ -inline bool is_error(cudaError_t e) { return e != cudaSuccess; } +namespace details { -inline std::string GetCudaErrorWebsite(int32_t cuda_version) { - std::ostringstream webstr; - webstr << "https://docs.nvidia.com/cuda/"; - if (cuda_version != -1) { - double version = cuda_version / 10; - webstr << "archive/" << std::fixed << std::setprecision(1) << version; +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + static constexpr const char* kTypeString = #proto_type; \ + static constexpr platform::proto::ApiType kProtoType = \ + platform::proto::ApiType::proto_type; \ } - webstr << "/cuda-runtime-api/group__CUDART__TYPES.html" - "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"; - return webstr.str(); -} -inline std::string build_nvidia_error_msg(cudaError_t e) { -#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000 - int32_t cuda_version = 100; -#elif CUDA_VERSION >= 9000 - int32_t cuda_version = 90; -#else - int32_t cuda_version = -1; +DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); +DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); +DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); +DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); +DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); #endif + +} // namespace details + +template +inline const char* GetErrorMsgUrl(T status) { + using __CUDA_STATUS_TYPE__ = decltype(status); + platform::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; + switch (proto_type) { + case platform::proto::ApiType::CUDA: + return "https://docs.nvidia.com/cuda/cuda-runtime-api/" + "group__CUDART__TYPES.html#group__CUDART__TYPES_" + "1g3f51e3575c2178246db0a94a430e0038"; + break; + case platform::proto::ApiType::CURAND: + return "https://docs.nvidia.com/cuda/curand/" + "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; + break; + case platform::proto::ApiType::CUDNN: + return "https://docs.nvidia.com/deeplearning/cudnn/api/" + "index.html#cudnnStatus_t"; + break; + case platform::proto::ApiType::CUBLAS: + return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; + break; + case platform::proto::ApiType::CUSOLVER: + return "https://docs.nvidia.com/cuda/cusolver/" + "index.html#cuSolverSPstatus"; + break; + case platform::proto::ApiType::NCCL: + return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" + "types.html#ncclresult-t"; + break; + default: + return "Unknown type of External API, can't get error message URL!"; + break; + } +} + +template +inline std::string GetExternalErrorMsg(T status) { std::ostringstream sout; - sout << " Cuda error(" << e << 
"), " << cudaGetErrorString(e) << "."; - static platform::proto::cudaerrorDesc cudaerror; - static bool _initSucceed = false; - if (cudaerror.ByteSizeLong() == 0) { + bool _initSucceed = false; + platform::proto::ExternalErrorDesc externalError; + if (externalError.ByteSizeLong() == 0) { std::string filePath; #if !defined(_WIN32) Dl_info info; - if (dladdr(reinterpret_cast(GetCudaErrorWebsite), &info)) { + if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { std::string strModule(info.dli_fname); const size_t last_slash_idx = strModule.find_last_of("/"); std::string compare_path = strModule.substr(strModule.length() - 6); @@ -724,18 +766,19 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { strModule.erase(last_slash_idx, std::string::npos); } if (compare_path.compare("avx.so") == 0) { - filePath = strModule + - "/../include/third_party/cudaerror/data/cudaErrorMessage.pb"; - } else { filePath = - strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb"; + strModule + + "/../include/third_party/externalError/data/externalErrorMsg.pb"; + } else { + filePath = strModule + + "/../../third_party/externalError/data/externalErrorMsg.pb"; } } #else char buf[100]; MEMORY_BASIC_INFORMATION mbi; HMODULE h_module = - (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0) + (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) ? (HMODULE)mbi.AllocationBase : NULL; GetModuleFileName(h_module, buf, 100); @@ -746,198 +789,118 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { strModule.erase(last_slash_idx, std::string::npos); } if (compare_path.compare("avx.pyd") == 0) { - filePath = - strModule + - "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + filePath = strModule + + "\\..\\include\\third_" + "party\\externalerror\\data\\externalErrorMsg.pb"; } else { filePath = - strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + strModule + + "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; } #endif std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = cudaerror.ParseFromIstream(&fin); + _initSucceed = externalError.ParseFromIstream(&fin); } + using __CUDA_STATUS_TYPE__ = decltype(status); + platform::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; if (_initSucceed) { - for (int i = 0; i < cudaerror.allmessages_size(); ++i) { - if (cuda_version == cudaerror.allmessages(i).version()) { - for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) { - if (e == cudaerror.allmessages(i).messages(j).errorcode()) { - sout << "\n [Advise: " - << cudaerror.allmessages(i).messages(j).errormessage() << "]"; + for (int i = 0; i < externalError.errors_size(); ++i) { + if (proto_type == externalError.errors(i).type()) { + for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { + if (status == externalError.errors(i).messages(j).code()) { + sout << "\n [Hint: " + << externalError.errors(i).messages(j).message() << "]"; return sout.str(); } } } } } - sout << "\n [Advise: Please search for the error code(" << e - << ") on website( " << GetCudaErrorWebsite(cuda_version) - << " ) to get Nvidia's official solution about CUDA Error.]"; + + sout << "\n [Hint: Please search for the error code(" << status + << ") on website (" << GetErrorMsgUrl(status) + << ") to get Nvidia's official solution and advice about " + << details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString + << " Error.]"; return sout.str(); } -/** curand 
ERROR **/ -inline bool is_error(curandStatus_t stat) { - return stat != CURAND_STATUS_SUCCESS; +template std::string GetExternalErrorMsg(cudaError_t); +template std::string GetExternalErrorMsg(curandStatus_t); +template std::string GetExternalErrorMsg(cudnnStatus_t); +template std::string GetExternalErrorMsg(cublasStatus_t); +template std::string GetExternalErrorMsg(cusolverStatus_t); +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +template std::string GetExternalErrorMsg(ncclResult_t); +#endif + +/*************** CUDA ERROR ***************/ +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } + +inline std::string build_nvidia_error_msg(cudaError_t e) { + std::ostringstream sout; + sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". " + << GetExternalErrorMsg(e); + return sout.str(); } -inline const char* curandGetErrorString(curandStatus_t stat) { - switch (stat) { - case CURAND_STATUS_SUCCESS: - return "`CURAND_STATUS_SUCCESS`. No errors."; - case CURAND_STATUS_VERSION_MISMATCH: - return "`CURAND_STATUS_VERSION_MISMATCH`. Header file and linked library " - "version do not match."; - case CURAND_STATUS_NOT_INITIALIZED: - return "`CURAND_STATUS_NOT_INITIALIZED`. Generator not initialized."; - case CURAND_STATUS_ALLOCATION_FAILED: - return "`CURAND_STATUS_ALLOCATION_FAILED`. Memory allocation failed."; - case CURAND_STATUS_TYPE_ERROR: - return "`CURAND_STATUS_TYPE_ERROR`. Generator is wrong type."; - case CURAND_STATUS_OUT_OF_RANGE: - return "`CURAND_STATUS_OUT_OF_RANGE`. Argument out of range."; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "`CURAND_STATUS_LENGTH_NOT_MULTIPLE`. Length requested is not a " - "multple of dimension."; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "`CURAND_STATUS_DOUBLE_PRECISION_REQUIRED`. GPU does not have " - "double precision required by MRG32k3a."; - case CURAND_STATUS_LAUNCH_FAILURE: - return "`CURAND_STATUS_LAUNCH_FAILURE`. Kernel launch failure."; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "`CURAND_STATUS_PREEXISTING_FAILURE`. Preexisting failure on " - "library entry."; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "`CURAND_STATUS_INITIALIZATION_FAILED`. Initialization of CUDA " - "failed."; - case CURAND_STATUS_ARCH_MISMATCH: - return "`CURAND_STATUS_ARCH_MISMATCH`. Architecture mismatch, GPU does " - "not support requested feature."; - case CURAND_STATUS_INTERNAL_ERROR: - return "`CURAND_STATUS_INTERNAL_ERROR`. Internal library error."; - default: - return "Unknown curand status"; - } +/*************** CURAND ERROR ***************/ +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::string msg(" Curand error, "); - return msg + curandGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CURAND error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUDNN ERROR *****/ +/*************** CUDNN ERROR ***************/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - std::string msg(" Cudnn error, "); - return msg + platform::dynload::cudnnGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUDNN error(" << stat << "), " + << platform::dynload::cudnnGetErrorString(stat) << ". 
" + << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUBLAS ERROR *****/ +/*************** CUBLAS ERROR ***************/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -inline const char* cublasGetErrorString(cublasStatus_t stat) { - switch (stat) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "`CUBLAS_STATUS_NOT_INITIALIZED`. The cuBLAS library was not " - "initialized."; - case CUBLAS_STATUS_ALLOC_FAILED: - return "`CUBLAS_STATUS_ALLOC_FAILED`. Resource allocation failed inside " - "the cuBLAS library."; - case CUBLAS_STATUS_INVALID_VALUE: - return "`CUBLAS_STATUS_INVALID_VALUE`. An unsupported value or parameter " - "was passed to the function (a negative vector size, for " - "example)."; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "`CUBLAS_STATUS_ARCH_MISMATCH`. The function requires a feature " - "absent from the device architecture; usually caused by the lack " - "of support for double precision."; - case CUBLAS_STATUS_MAPPING_ERROR: - return "`CUBLAS_STATUS_MAPPING_ERROR`. An access to GPU memory space " - "failed, which is usually caused by a failure to bind a texture."; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "`CUBLAS_STATUS_EXECUTION_FAILED`. The GPU program failed to " - "execute. This is often caused by a launch failure of the kernel " - "on the GPU, which can be caused by multiple reasons."; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "`CUBLAS_STATUS_INTERNAL_ERROR`. An internal cuBLAS operation " - "failed. This error is usually caused by a cudaMemcpyAsync() " - "failure."; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "`CUBLAS_STATUS_NOT_SUPPORTED`. The functionality requested is " - "not supported."; - case CUBLAS_STATUS_LICENSE_ERROR: - return "`CUBLAS_STATUS_LICENSE_ERROR`. The functionality requested " - "requires some license and an error was detected when trying to " - "check the current licensing."; - default: - return "Unknown cublas status"; - } -} - inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cublasGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUSOLVER ERROR *****/ +/*************** CUSOLVER ERROR ***************/ inline bool is_error(cusolverStatus_t stat) { return stat != CUSOLVER_STATUS_SUCCESS; } -inline const char* cusolverGetErrorString(cusolverStatus_t stat) { - switch (stat) { - case CUSOLVER_STATUS_NOT_INITIALIZED: - return "`CUSOLVER_STATUS_NOT_INITIALIZED`. The cuSolver library was not " - "initialized. This is usually caused by the lack of a prior call, " - "an error in the CUDA Runtime API called by the cuSolver routine, " - "or an error in the hardware setup."; - case CUSOLVER_STATUS_ALLOC_FAILED: - return "`CUSOLVER_STATUS_ALLOC_FAILED`. Resource allocation failed " - "inside the cuSolver library. This is usually caused by a " - "cudaMalloc() failure."; - case CUSOLVER_STATUS_INVALID_VALUE: - return "`CUSOLVER_STATUS_INVALID_VALUE`. An unsupported value or " - "parameter was passed to the function (a negative vector size, " - "for example)."; - case CUSOLVER_STATUS_ARCH_MISMATCH: - return "`CUSOLVER_STATUS_ARCH_MISMATCH`. The function requires a feature " - "absent from the device architecture; usually caused by the lack " - "of support for atomic operations or double precision."; - case CUSOLVER_STATUS_EXECUTION_FAILED: - return "`CUSOLVER_STATUS_EXECUTION_FAILED`. The GPU program failed to " - "execute. 
This is often caused by a launch failure of the kernel " - "on the GPU, which can be caused by multiple reasons."; - case CUSOLVER_STATUS_INTERNAL_ERROR: - return "`CUSOLVER_STATUS_INTERNAL_ERROR`. An internal cuSolver operation " - "failed. This error is usually caused by a cudaMemcpyAsync() " - "failure."; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "`CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED`. The matrix type is " - "not supported by this function. This is usually caused by " - "passing an invalid matrix descriptor to the function."; - default: - return "Unknown cusolver status"; - } -} - inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cusolverGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/****** NCCL ERROR ******/ +/**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { return nccl_result != ncclSuccess; } inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::string msg(" Nccl error, "); + std::ostringstream sout; + sout << "NCCL error(" << nccl_result << "), " + << platform::dynload::ncclGetErrorString(nccl_result) << ". "; if (errno == ENOSPC || errno == EAGAIN) { std::string detail(strerror(errno)); detail += "\nPlease try one of the following solutions:"; @@ -947,42 +910,19 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { "\n3. Increase shared memory by setting the -shm-size " "option when starting docker container, e.g., setting " " -shm-size=2g.\n"; - return msg + platform::dynload::ncclGetErrorString(nccl_result) + - ", detail: " + detail + " "; + sout << " Detail: " + detail; } - return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; + sout << GetExternalErrorMsg(nccl_result); + return sout.str(); } #endif // not(__APPLE__) and PADDLE_WITH_NCCL -namespace details { - -template -struct CudaStatusType {}; - -#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ - template <> \ - struct CudaStatusType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess); -DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); -#endif -} // namespace details - #define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = ::paddle::platform::errors::External( \ @@ -1023,7 +963,7 @@ inline void retry_sleep(unsigned milliseconds) { int retry_count = 1; \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ 
retry_sleep(FLAGS_gpu_allocator_retry_time); \ @@ -1037,10 +977,11 @@ inline void retry_sleep(unsigned milliseconds) { } \ } while (0) -#undef DEFINE_CUDA_STATUS_TYPE +#undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA -/** HIP PADDLE ENFORCE FUNCTIONS AND MACROS **/ +/**************************************************************************/ +/***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP /***** HIP ERROR *****/ @@ -1052,7 +993,7 @@ inline std::string build_rocm_error_msg(hipError_t e) { return sout.str(); } -/** HIPRAND ERROR **/ +/***** HIPRAND ERROR *****/ inline bool is_error(hiprandStatus_t stat) { return stat != HIPRAND_STATUS_SUCCESS; } @@ -1153,22 +1094,22 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { namespace details { template -struct CudaStatusType {}; +struct ExternalApiType {}; -#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ - template <> \ - struct CudaStatusType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ } -DEFINE_CUDA_STATUS_TYPE(hipError_t, hipSuccess); -DEFINE_CUDA_STATUS_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(miopenStatus_t, miopenStatusSuccess); -DEFINE_CUDA_STATUS_TYPE(rocblas_status, rocblas_status_success); +DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); +DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); +DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -1178,7 +1119,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = ::paddle::platform::errors::External( \ @@ -1201,7 +1142,7 @@ inline void retry_sleep(unsigned millisecond) { int retry_count = 1; \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ retry_sleep(FLAGS_gpu_allocator_retry_time); \ @@ -1215,7 +1156,7 @@ inline void retry_sleep(unsigned millisecond) { } \ } while (0) -#undef DEFINE_CUDA_STATUS_TYPE +#undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_HIP #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 39f3d3f00c9997eea3f4ab1e5652fcc78f1be0a6..842d4cc139281aab48131759f63003b3fe3890c2 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -304,6 +304,7 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); + std::cout << ex_msg << std::endl; return ex_msg.find(msg) != std::string::npos; } } @@ -338,30 
+339,98 @@ TEST(enforce, hip_success) { #else TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error")); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "CUDA error")); + + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "CUDA error")); + + EXPECT_TRUE(CheckCudaStatusFailure( + cudaErrorInsufficientDriver, + "This indicates that the installed NVIDIA CUDA driver is older than the " + "CUDA runtime library. This is not a supported configuration.Users " + "should install an updated NVIDIA display driver to allow the " + "application to run")); + EXPECT_TRUE(CheckCudaStatusFailure( + cudaErrorContextIsDestroyed, + "This error indicates that the context current to the calling thread has " + "been destroyed using cuCtxDestroy, or is a primary context which has " + "not yet been initialized")); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); + CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "CURAND error")); EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); + CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "CURAND error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CURAND_STATUS_ARCH_MISMATCH, + "Architecture mismatch, GPU does not support requested feature")); + EXPECT_TRUE( + CheckCudaStatusFailure(CURAND_STATUS_LENGTH_NOT_MULTIPLE, + "Length requested is not a multple of dimension")); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); + CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "CUDNN error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "CUDNN error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUDNN_STATUS_BAD_PARAM, + "An incorrect value or parameter was passed to the function. To correct, " + "ensure that all the parameters being passed have valid values")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUDNN_STATUS_LICENSE_ERROR, + "The functionality requested requires some license and an error was " + "detected when trying to check the current licensing. This error can " + "happen if the license is not present or is expired or if the " + "environment variable NVIDIA_LICENSE_FILE is not set properly")); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); + CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUBLAS_STATUS_EXECUTION_FAILED, + "The GPU program failed to execute. This is often caused by a launch " + "failure of the kernel on the GPU, which can be caused by multiple " + "reasons. To correct: check that the hardware, an appropriate version " + "of the driver, and the cuBLAS library are correctly installed")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUBLAS_STATUS_MAPPING_ERROR, + "An access to GPU memory space failed, which is usually caused by a " + "failure to bind a texture. 
To correct: prior to the function call, " + "unbind any previously bound textures")); + + EXPECT_TRUE(CheckCudaStatusSuccess(CUSOLVER_STATUS_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUSOLVER_STATUS_NOT_INITIALIZED, + "CUSOLVER error")); EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); + CheckCudaStatusFailure(CUSOLVER_STATUS_ALLOC_FAILED, "CUSOLVER error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUSOLVER_STATUS_INTERNAL_ERROR, + "An internal cuSolver operation failed. This error is usually caused by " + "a cudaMemcpyAsync() failure.To correct: check that the hardware, an " + "appropriate version of the driver, and the cuSolver library are " + "correctly installed. Also, check that the memory passed as a parameter " + "to the routine is not being deallocated prior to the routine’s " + "completion")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUSOLVER_STATUS_INVALID_VALUE, + "An unsupported value or parameter was passed to the function (a " + "negative vector size, for example).To correct: ensure that all the " + "parameters being passed have valid values")); + /* #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); - EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError, + "An internal check failed. This is either " + "a bug in NCCL or due to memory " + "corruption")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclInvalidUsage, + "The call to NCCL is incorrect. This is " + "usually reflecting a programming error")); #endif +*/ } #endif #endif diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 0985b884d1daf727ccabf76a3040a1576f2f96b7..3a81cfab865c2835d02e031dc6b3d0128ecba2a9 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -40,7 +40,7 @@ class Event { // The DeviceContext is used to get the cuda stream. // If CPU profiling mode, can pass nullptr. Event(EventType type, std::string name, uint32_t thread_id, - EventRole role = EventRole::kOrdinary); + EventRole role = EventRole::kOrdinary, std::string attr = "none"); const EventType& type() const; Event* parent() const { return parent_; } @@ -50,7 +50,7 @@ class Event { uint32_t thread_id() const { return thread_id_; } void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } - + std::string attr() const { return attr_; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } @@ -69,6 +69,7 @@ class Event { EventRole role_{}; int64_t cpu_ns_; bool visited_status_{false}; + std::string attr_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/external_error.proto similarity index 58% rename from paddle/fluid/platform/cuda_error.proto rename to paddle/fluid/platform/external_error.proto index b55e0af81ee6f8fb47d558287c7f902ef0fde81b..2094de7e10f69e98cc450d4221a85c6f904770ed 100644 --- a/paddle/fluid/platform/cuda_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -15,21 +15,32 @@ limitations under the License. 
*/ syntax = "proto2"; package paddle.platform.proto; +// (NOTE:zhouwei): ApiType describes which kind of external third party API +// More external third party API can be added. +enum ApiType { + CUDA = 0; + CURAND = 1; + CUDNN = 2; + CUBLAS = 3; + CUSOLVER = 4; + NCCL = 5; +} + message MessageDesc { - // Indicates the type of error - required int32 errorCode = 1; + // Indicates the code of error + required int32 code = 1; // Indicates the message of error - required string errorMessage = 2; + required string message = 2; } message AllMessageDesc { - // Version of cuda API - required int32 version = 1; + // Indicates which kind of third-party API + required ApiType type = 1; // Error messages of different errortype - repeated MessageDesc Messages = 2; + repeated MessageDesc messages = 2; } -message cudaerrorDesc { - // Error messages of different cuda versions(9.0/10.0/10.2) - repeated AllMessageDesc AllMessages = 2; +message ExternalErrorDesc { + // Error messages of different kind of external third party API + repeated AllMessageDesc errors = 1; } \ No newline at end of file diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index d6563be48fe484cae5c54c52c87e5c3a1493e584..2981e5502ce6ac2d5cf55e8bf60a30035f032a3a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -35,7 +35,8 @@ using user_function = std::function(const float*)>; using memory = mkldnn::memory; template + typename TBackward = mkldnn_dummy_primitive, + typename TBackward_params = mkldnn_dummy_primitive> class MKLDNNHandlerT { public: MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, @@ -72,6 +73,21 @@ class MKLDNNHandlerT { return backward_p; } + std::shared_ptr AcquireBackwardWeightsPrimitive() { + const std::string key_p = key_ + "@bwd_w_p"; + auto backward_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); + if (backward_p == nullptr) { + PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( + "Error: BWD_PD should be set when " + "getting BWD prim witk key: %s .", + key_p)); + backward_p = std::make_shared(*bwd_w_pd_); + dev_ctx_.SetBlob(key_p, backward_p); + } + return backward_p; + } + std::shared_ptr AcquireSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); @@ -116,6 +132,29 @@ class MKLDNNHandlerT { "@diff_src_mem_p"); } + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, + "@diff_wei_mem_p"); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + "@diff_wei_mem_p"); + } + protected: bool isCached() { const std::string key_pd = key_common_ + "@fwd_pd"; @@ -243,6 +282,27 @@ class MKLDNNHandlerT { } } + template + void AcquireBackwardWeightsPrimitiveDescriptorNonBlocking(Args&&... 
args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptorNonBlocking + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_w_pd"; + bwd_w_pd_ = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_w_pd_ == nullptr) { + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_w_pd_); + } + } + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { return std::static_pointer_cast( @@ -370,6 +430,7 @@ class MKLDNNHandlerT { std::string key_; std::shared_ptr fwd_pd_; std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; }; // TODO(grygielski) this class will be deleted later. diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index aef7f8648f8304d127e085364521cd9ded0fb85e..9c33233e1f79ac799d5acc2a711119d279a9613d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -32,8 +32,12 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role) - : type_(type), name_(name), thread_id_(thread_id), role_(role) { + EventRole role, std::string attr) + : type_(type), + name_(name), + thread_id_(thread_id), + role_(role), + attr_(attr) { cpu_ns_ = GetTimeInNsec(); } @@ -52,7 +56,8 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, const EventRole role, + const std::string attr) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -69,7 +74,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { is_enabled_ = true; // lock is not needed, the code below is thread-safe // Maybe need the same push/pop behavior. 
- Event *e = PushEvent(name, role); + Event *e = PushEvent(name, role, attr); SetCurAnnotation(e); name_ = e->name(); } @@ -186,12 +191,14 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role); +Event *PushEvent(const std::string &name, const EventRole role, + std::string attr) { + return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, + attr); } -void PopEvent(const std::string &name, const EventRole role) { - GetEventList().Record(EventType::kPopRange, name, g_thread_id, role); +void PopEvent(const std::string &name, const EventRole role, std::string attr) { + GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 2e802bf5ea303c4a4bb75492746b2434bd75f595..512bbc195b5b25dc2f707204b126bcee9af622c1 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -126,7 +126,8 @@ struct MemEvenRecorder { struct RecordEvent { RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary); + const EventRole role = EventRole::kOrdinary, + const std::string attr = "none"); ~RecordEvent(); @@ -200,8 +201,10 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role); -void PopEvent(const std::string& name, const EventRole role); +Event* PushEvent(const std::string& name, const EventRole role, + const std::string attr = "none"); +void PopEvent(const std::string& name, const EventRole role, + const std::string attr = "none"); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index fa14ad4f63be084579951dc09eef5019f109364c..a6b542f53ae1785252b8993982345fd233902458 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -58,6 +58,8 @@ void BindDistFleetWrapper(py::module* m) { "DistFleetWrapper") .def(py::init([]() { return FleetWrapper::GetInstance(); })) .def("load_sparse", &FleetWrapper::LoadSparseOnServer) + .def("load_model", &FleetWrapper::LoadModel) + .def("load_one_table", &FleetWrapper::LoadModelOneTable) .def("init_server", &FleetWrapper::InitServer) .def("run_server", (uint64_t (FleetWrapper::*)(void)) & FleetWrapper::RunServer) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 586cbda7ccfc57bf3eb1250dbe98499e152a88b9..68e6e049cdbb0cd508536741c4902143f65f8f76 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/bfloat16.h" @@ -402,8 +403,8 @@ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, auto out_dims = out->dims(); auto in_dims = in->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = out_dims[i]; @@ -423,7 +424,8 @@ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, auto out_t = framework::EigenTensor::From( *out); - out_t.device(eigen_place) = in_t.slice(offsets, extents); + operators::EigenSlice, T, D>::Eval( + eigen_place, out_t, in_t, offsets, extents); } template diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index dd8146aa3a1147b2d4e77185647be720468043f7..8c323490cc964c5e3b69d6b512fdee22041d9803 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -76,6 +76,7 @@ if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 rem -------set cache build directory----------- rmdir build\python /s/q +rmdir build\paddle\third_party\externalError /s/q rmdir build\paddle\fluid\pybind /s/q rmdir build\paddle_install_dir /s/q rmdir build\paddle_inference_install_dir /s/q @@ -506,7 +507,6 @@ echo ======================================== echo Step 4. Running unit tests ... echo ======================================== - : set CI_SKIP_CPP_TEST if only *.py changed git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ff3ded9f9ea56e611557323b577532a45182f911..47187871cf4c800f391ebfca7cb0c29fd1c85909 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1234,21 +1234,21 @@ set +x fi if [[ "$is_exclusive" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$" else exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$" fi elif [[ "$is_multicard" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$" else multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$" fi else - if [[ $(echo $cpu_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$" - elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$" else single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$" @@ -1578,7 +1578,6 
@@ set -x #analy h/cu to Map file python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'analy_h_cu_file' $tmp_dir ${PADDLE_ROOT} - #generate ut map python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} wait; @@ -2075,6 +2074,34 @@ function summary_check_problems() { set -x } + +function reuse_so_cache() { + get_html="https://api.github.com/repos/PaddlePaddle/Paddle" + curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + merge_commit=`grep "sha" tmp.txt| awk -F \" 'NR==1{print $(NF-1)}'| sed 's# ##g'` + curl -X GET ${get_html}/commits/${merge_commit} -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + merge_pr=`grep -oP -m 1 '(#[0-9]*)' tmp.txt| sed 's/#//g'` + curl -X GET ${get_html}/pulls/${merge_pr}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + pr_commit=`grep "sha" tmp.txt |tail -3|head -1|awk -F : '{print $NF}'|sed 's#"##g'|sed 's#,##g'| sed 's# ##g'` + set +e + wget -q https://xly-devops.bj.bcebos.com/PR/Paddle/${merge_pr}/${pr_commit}/workspace/Paddle/build/proto_so.tar.gz + down_proto_so=`echo $?` + set -e + if [ "${down_proto_so}" -eq 0 ];then + export CI_SKIP_CPP_TEST=ON + cd build && mv ../proto_so.tar.gz . + tar --use-compress-program=pigz -xpf proto_so.tar.gz + cmake_gen ${PYTHON_ABI:-""} ${parallel_number} + cd python + touch stub.cc + alias cp=cp + cp -r ../../python/paddle . + python setup.py bdist_wheel + else + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + fi +} + function main() { local CMD=$1 local parallel_number=$2 @@ -2155,6 +2182,17 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + cpu_cicheck_coverage) + check_approvals_of_unittest 1 + check_diff_file_for_coverage + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + enable_unused_var_check + ;; + gpu_cicheck_coverage) + parallel_test + check_coverage + check_change_of_unittest ${PYTHON_ABI:-""} + ;; ci_preciseTest) insert_pile_to_h_cu_diff cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} @@ -2218,6 +2256,10 @@ function main() { parallel_test check_coverage ;; + reuse_so_cicheck_py35) + reuse_so_cache + parallel_test + ;; cmake_gen) cmake_gen ${PYTHON_ABI:-""} ;; diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ee4dcaa8979407ee0bcfc4e02af42c08c05efb03..7bac330376c44fb9632258b81ccb00255ab33a7c 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -258,6 +258,7 @@ from .device import get_cudnn_version # noqa: F401 from .device import set_device # noqa: F401 from .device import get_device # noqa: F401 from .fluid.framework import is_compiled_with_cuda # noqa: F401 +from .fluid.framework import is_compiled_with_rocm # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 from .device import XPUPlace # noqa: F401 @@ -384,6 +385,7 @@ __all__ = [ #noqa 'less_equal', 'triu', 'is_compiled_with_cuda', + 'is_compiled_with_rocm', 'sin', 'dist', 'unbind', diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index 2a476f63862cfa2a41853d129bd6764df5292d3f..b712729f6420d019f1c1b5599f8e85cfef1a3765 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -25,6 +25,7 @@ import importlib import paddle.dataset import six.moves.cPickle as pickle import glob +import paddle __all__ = [] @@ -95,16 +96,19 @@ def download(url, module_name, md5sum, save_name=None): chunk_size = 4096 total_length = int(total_length) total_iter = total_length / 
chunk_size + 1 - log_interval = total_iter / 20 if total_iter > 20 else 1 + log_interval = total_iter // 20 if total_iter > 20 else 1 log_index = 0 + bar = paddle.hapi.progressbar.ProgressBar( + total_iter, name='item') for data in r.iter_content(chunk_size=chunk_size): if six.PY2: data = six.b(data) f.write(data) log_index += 1 + bar.update(log_index, {}) if log_index % log_interval == 0: - sys.stderr.write(".") - sys.stdout.flush() + bar.update(log_index) + except Exception as e: # re-try continue diff --git a/python/paddle/device.py b/python/paddle/device.py index 803d54e11bea3fdfb069670c3f44548b52ce77ba..fce01d0d6751dca0076129ee5f9f4043b51ef09b 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -19,6 +19,7 @@ from paddle.fluid import core from paddle.fluid import framework from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.fluid.framework import is_compiled_with_cuda #DEFINE_ALIAS +from paddle.fluid.framework import is_compiled_with_rocm #DEFINE_ALIAS __all__ = [ 'get_cudnn_version', @@ -33,6 +34,7 @@ __all__ = [ # 'CUDAPinnedPlace', # 'CUDAPlace', 'is_compiled_with_cuda', + 'is_compiled_with_rocm', 'is_compiled_with_npu' ] diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 4f3a6f4768933d90782445edbc74f4f446a15a9b..e3b8d783b2ea5d7d555588edfda10dcb3d3115ff 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -239,31 +239,37 @@ def new_group(ranks=None, backend=None): if global_rank not in ranks: gp = Group(-1, -1, ring_id, ranks) _group_map[ring_id] = gp - return gp - - ranks = sorted(ranks) - group_rank = ranks.index(global_rank) - group_size = len(ranks) - gp = Group(group_rank, group_size, ring_id, ranks) - _group_map[ring_id] = gp - - if group_size < 2: - return gp - - strategy = core.ParallelStrategy() - strategy.nranks = group_size - strategy.local_rank = group_rank - strategy.trainer_endpoints = [genv.trainer_endpoints[i] for i in ranks] - strategy.current_endpoint = genv.current_endpoint - strategy.nrings = 1 - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(genv.device_id) - core.NCCLParallelContext(strategy, place).init_with_ring_id(ring_id) else: - assert False, ("no cuda device found") - # need to barrier to construct group - barrier(gp) + ranks = sorted(ranks) + group_rank = ranks.index(global_rank) + group_size = len(ranks) + gp = Group(group_rank, group_size, ring_id, ranks) + _group_map[ring_id] = gp + + if group_size >= 2: + strategy = core.ParallelStrategy() + strategy.nranks = group_size + strategy.local_rank = group_rank + strategy.trainer_endpoints = [ + genv.trainer_endpoints[i] for i in ranks + ] + strategy.current_endpoint = genv.current_endpoint + strategy.nrings = 1 + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(genv.device_id) + core.NCCLParallelContext(strategy, + place).init_with_ring_id(ring_id) + else: + assert False, ("no cuda device found") + else: + return gp + + # TODO(shenliang03): This is a temporary solution to solve the problem of + # hang caused by cross-creation of new_group + tmp = fill_constant([0], dtype="int32", value="1") + paddle.distributed.all_reduce(tmp, use_calc_stream=True) + paddle.distributed.wait(tmp) return gp @@ -775,7 +781,7 @@ def _c_identity(tensor, group=None): return out -def _c_concat(tensor, nranks, group=None): +def _c_concat(tensor, group=None): """ Return allgather of the tensor, mainly used with model parallel. 
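For reference, a minimal usage sketch of the reworked new_group path above (illustrative only; it assumes a two-GPU job launched through paddle.distributed.launch, and the group handle name tp_group is hypothetical):

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
# Every rank must call new_group, even ranks that are not members of
# `ranks`; the zero-size all_reduce appended at the end of new_group
# acts as a barrier so that concurrent group creation cannot hang.
tp_group = dist.new_group(ranks=[0, 1])
data = paddle.ones([2, 2])
if dist.get_rank() in [0, 1]:
    dist.all_reduce(data, group=tp_group)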
@@ -791,10 +797,14 @@ def _c_concat(tensor, nranks, group=None): return ring_id = 0 if group is None else group.id + global_rank = _get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = _get_global_env().world_size if group is None else group.nranks + if in_dygraph_mode(): return core.ops.c_concat(tensor, 'ring_id', ring_id, 'use_calc_stream', - True, 'nranks', nranks, 'use_model_parallel', - True) + True, 'rank', rank, 'nranks', nranks, + 'use_model_parallel', True) op_type = 'c_concat' helper = LayerHelper(op_type, **locals()) @@ -812,12 +822,13 @@ def _c_concat(tensor, nranks, group=None): 'ring_id': ring_id, 'use_calc_stream': True, 'use_model_parallel': True, - 'nranks': nranks + 'nranks': nranks, + 'rank': rank }) return out -def _c_split(tensor, rank, nranks, group=None): +def _c_split(tensor, group=None): """ Split tensor evenly among all members, mainly used with model parallel. @@ -834,6 +845,10 @@ def _c_split(tensor, rank, nranks, group=None): return ring_id = 0 if group is None else group.id + global_rank = _get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = _get_global_env().world_size if group is None else group.nranks + if in_dygraph_mode(): return core.ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, @@ -883,6 +898,24 @@ def _mp_allreduce(tensor, raise NotImplementedError("No support _mp_allreduce in dygraph mode.") +def _c_lookup_table(table, index, start_index=0, name=None): + """ + Lookup table according to index. + + Args: + table (Tensor): The input Tensor. Its data type + should be float16, float32, float64. + index (Tensor): The index to lookup table. + start_index (int): The initial index for table range. + name (string): The name of the api + + Returns: + Tensor. + """ + if in_dygraph_mode(): + return core.ops.c_embedding(table, index, "start_index", start_index) + + class _Linear(layers.Layer): """ Linear @@ -989,7 +1022,7 @@ def _parallel_linear(x, if axis == 0: if split_tensor: - x = _c_split(x, inner_rank, nranks, group=group) + x = _c_split(x, group=group) else: x = _c_identity(x, group=group) @@ -1009,16 +1042,18 @@ def _parallel_linear(x, name=name) linear_out = linear(x) - startup_block = paddle.static.default_startup_program().global_block() - main_block = paddle.static.default_main_program().global_block() - startup_block.vars[linear.weight.name].is_distributed = True - main_block.vars[linear.weight.name].is_distributed = True + startup_block = paddle.static.default_startup_program().current_block() + main_block = paddle.static.default_main_program().current_block() + startup_block._find_var_recursive(linear.weight.name).is_distributed = True + main_block._find_var_recursive(linear.weight.name).is_distributed = True + # set is_distributed for splited bias # if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank. 
# if a linear layer is split by col, the bias would also be split into each rank as its weight if axis == 1 and linear._bias_attr != False: - startup_block.vars[linear.bias.name].is_distributed = True - main_block.vars[linear.bias.name].is_distributed = True + startup_block._find_var_recursive( + linear.bias.name).is_distributed = True + main_block._find_var_recursive(linear.bias.name).is_distributed = True if not gather_out: return linear_out diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 5f9a61371d34f4026bd1c6b8bef4fc8c626eb847..3186df7db581a54d0417b40892ea5f3e6c91721c 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -77,6 +77,7 @@ stop_worker = fleet.stop_worker distributed_optimizer = fleet.distributed_optimizer save_inference_model = fleet.save_inference_model save_persistables = fleet.save_persistables +load_model = fleet.load_model minimize = fleet.minimize distributed_model = fleet.distributed_model step = fleet.step diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 5e883f1ac6cc91fcc8c7ececf2df15cdf856aed6..9e5a31d6899e07a1dd9266082654304e18958ee2 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -540,6 +540,29 @@ class Fleet(object): """ self._runtime_handle._init_server(*args, **kwargs) + def load_model(self, path, mode): + """ + Load the fleet model from the given path. + + Args: + path (str): The directory where the model was saved. + mode (int): The loading mode; its meaning is defined by the runtime handle. + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + fleet.init() + + # build net + # fleet.distributed_optimizer(...) + + # mode=0 is an illustrative value + fleet.load_model("path", mode=0) + + """ + self._runtime_handle.load_model(path, mode) + @is_non_distributed_check @inited_runtime_handler def run_server(self): diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index ca3606c16e5d4740f90c78f455f7c15f307f9f8c..285647352dfbb0c8faf56900e3fd16ab5700950f 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -638,3 +638,8 @@ def append_naive_sync(block, sync_var, ring_id): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) + block.append_op( + type='c_sync_calc_stream', + inputs={'X': [sync_var]}, + outputs={'Out': [sync_var]}, + attrs={OP_ROLE_KEY: OpRole.Forward}) diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 730a7430133e0658be87ed5c6d3f570400ff3ea9..91f9868f96ef90278b4d9340bf9d52c957ff5b1f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -43,14 +43,13 @@ class VocabParallelEmbedding(Layer): self.origin_num_embeddings = num_embeddings self.is_mp = (self.world_size > 1) - per_part_size = ( - num_embeddings + self.world_size - 1) // self.world_size - last_part_size = num_embeddings - per_part_size * (self.world_size - 1) - if self.rank == self.world_size - 1: - per_part_size = last_part_size - per_part_size += 1 # make the last row as the padding index - self.per_part_size = per_part_size + assert num_embeddings % self.world_size == 0, ( + "The length of the vocabulary must be divisible by the
parallelism degree of MP" + ) + + per_part_size = num_embeddings // self.world_size + self.vocab_start_index = self.rank * per_part_size self._dtype = self._helper.get_default_dtype() self._size = [per_part_size, embedding_dim] self._weight_attr = weight_attr @@ -63,49 +62,35 @@ class VocabParallelEmbedding(Layer): shape=self._size, dtype=self._dtype, is_bias=False) - self.weight[per_part_size - 1] = 0.0 - self.weight.is_distributed = True else: self.weight = self.create_parameter( attr=self._weight_attr, - shape=[num_embeddings, embedding_dim], + shape=self._size, dtype=self._dtype, is_bias=False) + self.weight.is_distributed = True + def forward(self, x): - if not self.is_mp: - return F.embedding( + if self.is_mp: + output_parallel = paddle.distributed.collective._c_lookup_table( + self.weight, + x, + start_index=self.vocab_start_index, + name=self._name) + output = paddle.distributed.collective._mp_allreduce( + output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + else: + output = F.embedding( x, weight=self.weight, padding_idx=None, sparse=False, name=self._name) - - origin_input_shape = x.shape - if len(origin_input_shape) == 2: - x = paddle.unsqueeze(x, axis=-1) - else: - assert origin_input_shape[-1] == 1, ( - "The last dimension size of x must be 1.") - x_shard = paddle.shard_index(x, self.origin_num_embeddings, - self.world_size, self.rank, - self.per_part_size - 1) - if len(origin_input_shape) == 2: - x_shard = paddle.squeeze(x_shard, axis=-1) - - emb_out = F.embedding( - x_shard, - weight=self.weight, - padding_idx=self.per_part_size - 1, - sparse=False, - name=self._name) - - emb_out = paddle.distributed.collective._mp_allreduce( - emb_out, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) - return emb_out + return output class ColumnParallelLinear(Layer): @@ -175,9 +160,7 @@ class ColumnParallelLinear(Layer): if self.gather_output and self.is_mp: output = paddle.distributed.collective._c_concat( - output_parallel, - nranks=self.world_size, - group=self.model_parallel_group) + output_parallel, group=self.model_parallel_group) else: output = output_parallel return output @@ -245,10 +228,7 @@ class RowParallelLinear(Layer): else: # split last dim input_parallel = paddle.distributed.collective._c_split( - x, - rank=self.rank, - nranks=self.world_size, - group=self.model_parallel_group) + x, group=self.model_parallel_group) output_parallel = F.linear(input_parallel, self.weight, name=self._name) diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index f18b82eaecd76a70bad4dec9aaf77dba34b2c158..642d0e427fa8c29c506e5fe60838169a0b241c86 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -35,6 +35,23 @@ def conv_indent(indent): PSERVER_SAVE_SUFFIX = ".shard" +def parse_table_class(varname, o_main_program): + from paddle.fluid.incubate.fleet.parameter_server.ir.public import is_distributed_sparse_op + from paddle.fluid.incubate.fleet.parameter_server.ir.public import is_sparse_op + + for op in o_main_program.global_block().ops: + if not is_distributed_sparse_op(op) and not is_sparse_op(op): + continue + + param_name = op.input("W")[0] + + if param_name == varname and op.type == "lookup_table" or op.type == "lookup_table_v2": + if op.has_attr('table_class') and op.attr("table_class") != "none": + return op.attr('table_class') + else: + return 
"CommonSparseTable" + + class Accessor: def __init__(self): self.accessor_class = "" @@ -723,13 +740,15 @@ class TheOnePSRuntime(RuntimeBase): table.type = "PS_SPARSE_TABLE" table.shard_num = 256 + common.table_name = self.compiled_strategy.grad_name_to_param_name[ + ctx.origin_varnames()[0]] + if self.compiled_strategy.is_geo_mode(): table.table_class = "SparseGeoTable" else: - table.table_class = "CommonSparseTable" + table.table_class = parse_table_class( + common.table_name, self.origin_main_program) - common.table_name = self.compiled_strategy.grad_name_to_param_name[ - ctx.origin_varnames()[0]] else: table.type = "PS_DENSE_TABLE" table.table_class = "CommonDenseTable" @@ -1044,6 +1063,9 @@ class TheOnePSRuntime(RuntimeBase): def _save_persistables(self, *args, **kwargs): self._ps_inference_save_persistables(*args, **kwargs) + def load_model(self, path, mode): + self._worker.load_model(path, mode) + def _shrink(self, threshold): import paddle.distributed.fleet as fleet fleet.util.barrier() diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py old mode 100644 new mode 100755 index e58c8aa1625ddecc6f80810d0266958a75ea4956..78503baf2fd5d2833e557a8d4e2f7271545aeca7 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -97,10 +97,12 @@ class RecomputeFunction(PyLayer): ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() # TODO support AMP + tracer = framework._dygraph_tracer() + ctx.is_fw_autocast = tracer._enable_autocast + ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): outputs = run_function(*args) - return outputs @staticmethod @@ -119,15 +121,23 @@ class RecomputeFunction(PyLayer): tracer = framework._dygraph_tracer() tracer._has_grad = True - # TODO support AMP - + # NOTE support AMP + # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: with swith_rng_state(ctx.fw_cuda_rng_state): + with paddle.amp.auto_cast( + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list): + detached_inputs = detach_variable(tuple(inputs)) + outputs = ctx.run_function(*detached_inputs) + else: + with paddle.amp.auto_cast( + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) - else: - detached_inputs = detach_variable(tuple(inputs)) - outputs = ctx.run_function(*detached_inputs) if isinstance(outputs, core.VarBase): outputs = (outputs, ) @@ -155,7 +165,6 @@ class RecomputeFunction(PyLayer): grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) - return grads diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index c46672dca09e97dadf9b49cf6ab2dc44931ba83f..e21f142f10b36c647a3f24116c049869d62e68f2 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -89,6 +89,18 @@ def _options_valid_check(options): % key) +def _get_default_nprocs(): + device = get_device() + if 'gpu' in device: + return core.get_cuda_device_count() + elif 'xpu' in device: + return core.get_xpu_device_count() + else: + raise RuntimeError( + "`paddle.distributed.spawn` does not support parallel training on device `{}` now.". 
+ format(device)) + + def _get_node_ip(ips): node_ip = None node_ips = [x.strip() for x in ips.split(',')] @@ -448,18 +460,7 @@ def spawn(func, args=(), nprocs=-1, join=True, daemon=False, **options): # get default nprocs if nprocs == -1: - device = get_device() - if device == 'cpu': - # TODO: not supports cpu parallel now - nprocs = _cpu_num() - elif device == 'gpu': - nprocs = core.get_cuda_device_count() - elif device == 'xpu': - nprocs = core.get_xpu_device_count() - else: - raise ValueError( - "`device` should be a string of `cpu`, 'gpu' or 'xpu', but got {}". - format(device)) + nprocs = _get_default_nprocs() # NOTE(chenweihang): [ why need get cluster info before run? ] # when using `paddle.distributed.spawn` start parallel training, diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 8c48033fc46f548d7f9605cee5eb3856cd8fc23e..30316b77adcdfc90e4aeb4d815a1c2de69887952 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -967,6 +967,7 @@ def sparse_embedding(input, padding_idx=None, is_test=False, entry=None, + table_class="CommonSparseTable", param_attr=None, dtype='float32'): helper = LayerHelper('sparse_embedding', **locals()) @@ -989,6 +990,10 @@ def sparse_embedding(input, padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( size[0] + padding_idx) + if table_class not in ["CommonSparseTable", "SSDSparseTable"]: + raise ValueError( + "table_class must be in [CommonSparseTable, SSDSparseTable]") + entry_str = "none" if entry is not None: @@ -1011,7 +1016,8 @@ def sparse_embedding(input, 'is_distributed': True, 'remote_prefetch': True, 'is_test': is_test, - 'entry': entry_str + 'entry': entry_str, + 'table_class': table_class }) return tmp diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index c4b90565a0924e78ae15a52a88ea8ad7ab2736d0..249de87090ed8a50fa3e00cd4087b42636c70fa0 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -25,21 +25,21 @@ function(inference_analysis_python_api_int8_test_mkldnn target model_dir data_pa _inference_analysis_python_api_int8_test(${target} ${model_dir} ${data_path} ${filename} True) endfunction() -function(download_quant_data install_dir data_file) +function(download_quant_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) endif() endfunction() -function(download_quant_model install_dir data_file) +function(download_quant_model install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum}) endif() endfunction() -function(download_quant_fp32_model install_dir data_file) +function(download_quant_fp32_model install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models/fp32 ${data_file} ${check_sum}) endif() endfunction() @@ -86,15 +86,15 @@ 
function(inference_quant2_int8_nlp_test target quant_model_dir fp32_model_dir da --ops_to_quantize ${ops_to_quantize}) endfunction() -function(download_quant_data install_dir data_file) +function(download_quant_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) endif() endfunction() -function(download_quant_model install_dir data_file) +function(download_quant_model install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum}) endif() endfunction() @@ -149,43 +149,43 @@ if(LINUX AND WITH_MKLDNN) # Quant ResNet50 set(QUANT_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant") set(QUANT_RESNET50_MODEL_ARCHIVE "ResNet50_qat_model.tar.gz") - download_quant_model(${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE}) + download_quant_model(${QUANT_RESNET50_MODEL_DIR} ${QUANT_RESNET50_MODEL_ARCHIVE} ff89b934ab961c3a4a844193ece2e8a7) inference_quant_int8_image_classification_test(test_quant_int8_resnet50_mkldnn ${QUANT_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant ResNet101 set(QUANT_RESNET101_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet101_quant") set(QUANT_RESNET101_MODEL_ARCHIVE "ResNet101_qat_model.tar.gz") - download_quant_model(${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE}) + download_quant_model(${QUANT_RESNET101_MODEL_DIR} ${QUANT_RESNET101_MODEL_ARCHIVE} 95c6d01e3aeba31c13efb2ba8057d558) # inference_quant_int8_image_classification_test(test_quant_int8_resnet101_mkldnn ${QUANT_RESNET101_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant GoogleNet set(QUANT_GOOGLENET_MODEL_DIR "${QUANT_INSTALL_DIR}/GoogleNet_quant") set(QUANT_GOOGLENET_MODEL_ARCHIVE "GoogleNet_qat_model.tar.gz") - download_quant_model(${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE}) + download_quant_model(${QUANT_GOOGLENET_MODEL_DIR} ${QUANT_GOOGLENET_MODEL_ARCHIVE} 1d4a7383baa63e7d1c423e8db2b791d5) inference_quant_int8_image_classification_test(test_quant_int8_googlenet_mkldnn ${QUANT_GOOGLENET_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant MobileNetV1 set(QUANT_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant") set(QUANT_MOBILENETV1_MODEL_ARCHIVE "MobileNetV1_qat_model.tar.gz") - download_quant_model(${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE}) + download_quant_model(${QUANT_MOBILENETV1_MODEL_DIR} ${QUANT_MOBILENETV1_MODEL_ARCHIVE} 3b774d94a9fcbb604d09bdb731fc1162) inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv1_mkldnn ${QUANT_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant MobileNetV2 set(QUANT_MOBILENETV2_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV2_quant") set(QUANT_MOBILENETV2_MODEL_ARCHIVE "MobileNetV2_qat_model.tar.gz") - download_quant_model(${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE}) + download_quant_model(${QUANT_MOBILENETV2_MODEL_DIR} ${QUANT_MOBILENETV2_MODEL_ARCHIVE} 758a99d9225d8b73e1a8765883f96cdd) inference_quant_int8_image_classification_test(test_quant_int8_mobilenetv2_mkldnn ${QUANT_MOBILENETV2_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant VGG16 set(QUANT_VGG16_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG16_quant") 
set(QUANT_VGG16_MODEL_ARCHIVE "VGG16_qat_model.tar.gz") - download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE}) + download_quant_model(${QUANT_VGG16_MODEL_DIR} ${QUANT_VGG16_MODEL_ARCHIVE} c37e63ca82a102f47be266f8068b0b55) # inference_quant_int8_image_classification_test(test_quant_int8_vgg16_mkldnn ${QUANT_VGG16_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant VGG19 set(QUANT_VGG19_MODEL_DIR "${QUANT_INSTALL_DIR}/VGG19_quant") set(QUANT_VGG19_MODEL_ARCHIVE "VGG19_qat_model.tar.gz") - download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE}) + download_quant_model(${QUANT_VGG19_MODEL_DIR} ${QUANT_VGG19_MODEL_ARCHIVE} 62bcd4b6c3ca2af67e8251d1c96ea18f) # inference_quant_int8_image_classification_test(test_quant_int8_vgg19_mkldnn ${QUANT_VGG19_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) ### Quant2 for image classification @@ -194,7 +194,7 @@ if(LINUX AND WITH_MKLDNN) # with weight scales in `fake_dequantize_max_abs` operators set(QUANT2_RESNET50_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2") set(QUANT2_RESNET50_MODEL_ARCHIVE "ResNet50_qat_perf.tar.gz") - download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE}) + download_quant_model(${QUANT2_RESNET50_MODEL_DIR} ${QUANT2_RESNET50_MODEL_ARCHIVE} e87309457e8c462a579340607f064d66) set(FP32_RESNET50_MODEL_DIR "${INT8_INSTALL_DIR}/resnet50") inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_mkldnn ${QUANT2_RESNET50_MODEL_DIR}/ResNet50_qat_perf/float ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) @@ -202,20 +202,20 @@ if(LINUX AND WITH_MKLDNN) # with weight scales in `fake_dequantize_max_abs` operators set(QUANT2_RESNET50_RANGE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_range") set(QUANT2_RESNET50_RANGE_MODEL_ARCHIVE "ResNet50_qat_range.tar.gz") - download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE}) + download_quant_model(${QUANT2_RESNET50_RANGE_MODEL_DIR} ${QUANT2_RESNET50_RANGE_MODEL_ARCHIVE} 2fdc8a139f041c0d270abec826b2d304) inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_range_mkldnn ${QUANT2_RESNET50_RANGE_MODEL_DIR}/ResNet50_qat_range ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant2 ResNet50 with input/output scales in `fake_quantize_range_abs_max` operators and the `out_threshold` attributes, # with weight scales in `fake_channel_wise_dequantize_max_abs` operators set(QUANT2_RESNET50_CHANNELWISE_MODEL_DIR "${QUANT_INSTALL_DIR}/ResNet50_quant2_channelwise") set(QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE "ResNet50_qat_channelwise.tar.gz") - download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE}) + download_quant_model(${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR} ${QUANT2_RESNET50_CHANNELWISE_MODEL_ARCHIVE} 887a1b1b0e9a4efd10f263a43764db26) inference_quant2_int8_image_classification_test(test_quant2_int8_resnet50_channelwise_mkldnn ${QUANT2_RESNET50_CHANNELWISE_MODEL_DIR}/ResNet50_qat_channelwise ${FP32_RESNET50_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) # Quant2 MobileNetV1 set(QUANT2_MOBILENETV1_MODEL_DIR "${QUANT_INSTALL_DIR}/MobileNetV1_quant2") set(QUANT2_MOBILENETV1_MODEL_ARCHIVE "MobileNet_qat_perf.tar.gz") - download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE}) + download_quant_model(${QUANT2_MOBILENETV1_MODEL_DIR} ${QUANT2_MOBILENETV1_MODEL_ARCHIVE} 7f626e453db2d56fed6c2538621ffacf) set(FP32_MOBILENETV1_MODEL_DIR "${INT8_INSTALL_DIR}/mobilenetv1") 
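The check_sum argument threaded through these download helpers amounts to an MD5 gate before the archive is unpacked; a rough Python sketch of that kind of check (illustrative only, the actual verification is performed by the download machinery, not by this snippet):

import hashlib

def md5_matches(path, expected_md5):
    # Stream the archive in 1 MiB chunks so large tarballs
    # do not have to fit in memory.
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected_md5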
inference_quant2_int8_image_classification_test(test_quant2_int8_mobilenetv1_mkldnn ${QUANT2_MOBILENETV1_MODEL_DIR}/MobileNet_qat_perf/float ${FP32_MOBILENETV1_MODEL_DIR}/model ${IMAGENET_DATA_PATH}) @@ -225,22 +225,22 @@ if(LINUX AND WITH_MKLDNN) set(NLP_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_dataset") set(NLP_DATA_PATH "${NLP_DATA_DIR}/Ernie_dataset/1.8w.bs1") set(NLP_LABLES_PATH "${NLP_DATA_DIR}/Ernie_dataset/label.xnli.dev") - download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE}) + download_quant_data(${NLP_DATA_DIR} ${NLP_DATA_ARCHIVE} e650ce0cbc1fadbed5cc2c01d4e734dc) # Quant2 Ernie set(QUANT2_ERNIE_MODEL_ARCHIVE "ernie_qat.tar.gz") set(QUANT2_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_quant2") - download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE}) + download_quant_model(${QUANT2_ERNIE_MODEL_DIR} ${QUANT2_ERNIE_MODEL_ARCHIVE} f7cdf4720755ecf66efbc8044e9922d9) set(FP32_ERNIE_MODEL_ARCHIVE "ernie_fp32_model.tar.gz") set(FP32_ERNIE_MODEL_DIR "${QUANT_INSTALL_DIR}/Ernie_float") - download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE}) + download_quant_fp32_model(${FP32_ERNIE_MODEL_DIR} ${FP32_ERNIE_MODEL_ARCHIVE} 114f38804a3ef8c45e7259e68bbd838b) set(QUANT2_ERNIE_OPS_TO_QUANTIZE "fc,reshape2,transpose2,matmul,elementwise_add") inference_quant2_int8_nlp_test(test_quant2_int8_ernie_mkldnn ${QUANT2_ERNIE_MODEL_DIR}/Ernie_qat/float ${FP32_ERNIE_MODEL_DIR}/ernie_fp32_model ${NLP_DATA_PATH} ${NLP_LABLES_PATH} ${QUANT2_ERNIE_OPS_TO_QUANTIZE}) # Quant2 GRU set(QUANT2_GRU_MODEL_ARCHIVE "GRU_quant_acc.tar.gz") set(QUANT2_GRU_MODEL_DIR "${QUANT_INSTALL_DIR}/GRU_quant2") - download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE}) + download_quant_model(${QUANT2_GRU_MODEL_DIR} ${QUANT2_GRU_MODEL_ARCHIVE} cf207f8076dcfb8b74d8b6bdddf9090c) set(QUANT2_GRU_OPS_TO_QUANTIZE "multi_gru") ### Save FP32 model or INT8 model from Quant model @@ -302,8 +302,8 @@ endforeach() # setting timeout value for old unittests if(NOT WIN32) set_tests_properties(test_post_training_quantization_lstm_model PROPERTIES TIMEOUT 120) - set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY") - set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 400 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index 850b267411ed5d98d21f8dd0cc14ad76fd9b641c..f43b45553f5f0009b43da28683ef93cc70684210 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -73,7 +73,7 @@ def resnet_cifar10(input, depth=32): return pool -def train(use_pure_fp16=True, use_nesterov=False, use_adam=False): +def train(use_pure_fp16=True, use_nesterov=False, optimizer=""): classdim = 10 data_shape = [3, 32, 32] BATCH_SIZE = 32 @@ -96,12 +96,17 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False): # Test program test_program = train_program.clone(for_test=True) - if 
use_adam: + if optimizer == "Adam": optimizer = paddle.optimizer.AdamW( learning_rate=0.001, epsilon=1e-8, weight_decay=0.0, multi_precision=True) + elif optimizer == "Lars": + optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer( + learning_rate=0.001, + momentum=0.9, + multi_precision=use_pure_fp16) else: optimizer = paddle.optimizer.Momentum( learning_rate=0.001, @@ -169,9 +174,11 @@ class TestImageMultiPrecision(unittest.TestCase): if not fluid.core.is_compiled_with_cuda(): return - def do_test(use_nesterov=False, use_adam=False): - if use_adam: + def do_test(use_nesterov=False, optimizer=""): + if optimizer == "Adam": suffix = "use Adam" + elif optimizer == "Lars": + suffix = "use Lars" else: suffix = "with Nesterov" if use_nesterov else "without Nesterov" with self.scope_prog_guard(): @@ -180,14 +187,14 @@ class TestImageMultiPrecision(unittest.TestCase): train_loss_fp16, test_loss_fp16 = train( use_pure_fp16=True, use_nesterov=use_nesterov, - use_adam=use_adam) + optimizer=optimizer) with self.scope_prog_guard(): print("-----------------FP32 Train {}-----------------".format( suffix)) train_loss_fp32, test_loss_fp32 = train( use_pure_fp16=False, use_nesterov=use_nesterov, - use_adam=use_adam) + optimizer=optimizer) self.assertTrue( np.allclose( @@ -208,7 +215,8 @@ class TestImageMultiPrecision(unittest.TestCase): do_test(use_nesterov=False) do_test(use_nesterov=True) - do_test(use_adam=True) + do_test(optimizer="Adam") + do_test(optimizer="Lars") @contextlib.contextmanager def scope_prog_guard(self): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index fa168a62de11a9bebb2199924576e32685ed6513..29eee429ef66ab7e324d234b903d0e80510454b7 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -25,6 +25,7 @@ from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import Br from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakTransformOptimizer from paddle.fluid.dygraph.dygraph_to_static.call_transformer import CallTransformer from paddle.fluid.dygraph.dygraph_to_static.cast_transformer import CastTransformer +from paddle.fluid.dygraph.dygraph_to_static.grad_transformer import GradTransformer from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import IfElseTransformer from paddle.fluid.dygraph.dygraph_to_static.list_transformer import ListTransformer from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTransformer @@ -86,6 +87,7 @@ class DygraphToStaticAst(gast.NodeTransformer): PrintTransformer, # print statement CallTransformer, # transform call recursively CastTransformer, # type casting statement + GradTransformer, # transform paddle.grad to paddle.gradients ] for index, transformer in enumerate(transformers): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a59063ae653fb45ad27f1525920d4dd986e24e --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import gast +import warnings + +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static import utils + + +class GradTransformer(gast.NodeTransformer): + """ + A class that transforms dygraph paddle.grad to static graph paddle.gradients. The + transformation is applied to support double grad mode. + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Input non-AstNodeWrapper node for the initialization of GradTransformer." + self.wrapper_root = wrapper_root + self.root = wrapper_root.node + + def transform(self): + self.visit(self.root) + + def visit_Call(self, node): + self.generic_visit(node) + if not is_grad_api_node(node): + return node + + dygraph_grad_parameters = [ + "outputs", "inputs", "grad_outputs", "retain_graph", "create_graph", + "only_inputs", "allow_unused", "no_grad_vars" + ] + to_static_grad_param = { + "outputs": "targets", + "inputs": "inputs", + "grad_outputs": "target_gradients", + "no_grad_vars": "no_grad_set" + } + static_keywords = [] + + for kw in node.keywords: + if kw.arg not in dygraph_grad_parameters or kw.arg not in to_static_grad_param: + warnings.warn("paddle.grad has unsupported parameter in jit: " + + kw.arg + ", jit will discard it") + continue + dygraph_grad_parameters.remove(kw.arg) + kw.arg = to_static_grad_param[kw.arg] + static_keywords.append(kw) + + for i in range(len(node.args)): + arg_name = dygraph_grad_parameters[i] + if arg_name not in to_static_grad_param: + warnings.warn("paddle.grad has unsupported parameter in jit: " + + arg_name + ", jit will discard it") + continue + kw = gast.keyword( + arg=to_static_grad_param[arg_name], value=node.args[i]) + static_keywords.append(kw) + + node.func = gast.parse('paddle.static.gradients').body[0].value + node.keywords = static_keywords + node.args = [] + return node + + +def is_grad_api_node(node): + assert isinstance(node, gast.Call) + api_name = utils.ast_to_source_code(node.func).strip() + if utils.is_paddle_api(node): + return api_name.endswith("grad") + return False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index de788487feabc7f01b8c26bbd62e4d9a595a34fd..5bc1c3d96d9c959fae0c39290bc531921e6022ec 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -402,7 +402,7 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, var for var in _vars_with_store(child_dict) if var in parent_dict ]) - def _vars_loaded_before_store(ids_dict): + def _vars_loaded(ids_dict): """ gast.Param is also a kind of `load` semantic.
""" @@ -411,8 +411,6 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, for ctx in ctxs: if isinstance(ctx, (gast.Load, gast.Param)): new_dict[k].append(ctx) - elif isinstance(ctx, gast.Store): - break return new_dict # modified vars @@ -439,8 +437,12 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, new_vars_in_body_and_orelse = body_new_vars & orelse_new_vars # 3. new var is created only in one of If.body or If.orelse node, and it used as gast.Load firstly after gast.If node. + # TODO(zhhsplendid): the _vars_loaded can be optimized as _vars_loaded_before_store. Because if a variable is stored before load, + # the value would change by the store statement, we don't have to return to change the value. However, analysis is + # complex because if the IfElse is nested and outer IfElse store statement may not run at all. We will put this optimization + # as the future TODO used_vars_after_ifelse = set( - [var for var in _vars_loaded_before_store(after_ifelse_vars_dict)]) + [var for var in _vars_loaded(after_ifelse_vars_dict)]) new_vars_to_create = new_vars_in_one_of_body_or_orelse & used_vars_after_ifelse | new_vars_in_body_and_orelse # 4. generate return_ids of if/else node. diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index feb8b0f9c9a16e2d418b12be0397ea11c890dfe7..6eea883226b36b7c1804214fc5e4c9a306c53d01 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -135,6 +135,7 @@ class PartialProgramLayer(layers.Layer): self._origin_main_program = self._verify_program(main_program) self._inner_scope = core.Scope() # Set default mode to train + self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True @LazyInitialized @@ -192,24 +193,44 @@ class PartialProgramLayer(layers.Layer): """ required_params = [] for param in self._params: + found_param = False for block in program.blocks: - if param.name in block.vars: - required_params.append(param) + for op in block.ops: + if param.name in op.input_arg_names or param.name in op.output_arg_names: + required_params.append(param) + found_param = True + break + if found_param: break self._params = required_params + def _get_double_grads(self, program): + double_grads = [] + for block in program.blocks: + for name in block.vars: + if "@GRAD" in name: + var_desc = block.vars[name].desc + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) + double_grads.append(var_base) + return double_grads + def forward(self, inputs): in_vars, out_vars, tmp_scope_vec = self._prepare(inputs) - framework._dygraph_tracer().trace_op( type='run_program', inputs={ 'X': valid_vars(in_vars), 'Params': valid_vars(self._params) }, - outputs={'Out': valid_vars(out_vars), - 'OutScope': tmp_scope_vec}, + outputs={ + 'Out': valid_vars(out_vars), + 'OutScope': tmp_scope_vec, + 'DOut': valid_vars(self._double_grads) + }, attrs={ 'global_block': self.program.desc.block(0), 'start_op_index': 0, diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 33eb16f1b2b44cce8a979f062e2620a0f351e27f..d5ad3a88e8c2416e16d9d01437e5743d7b5d4834 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -166,29 +166,46 @@ def _get_loaded_var_new_old(program_desc, all_new_old_dict_all): def 
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index feb8b0f9c9a16e2d418b12be0397ea11c890dfe7..6eea883226b36b7c1804214fc5e4c9a306c53d01 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -135,6 +135,7 @@ class PartialProgramLayer(layers.Layer): self._origin_main_program = self._verify_program(main_program) self._inner_scope = core.Scope() # Set default mode to train + self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True @LazyInitialized @@ -192,24 +193,44 @@ class PartialProgramLayer(layers.Layer): """ required_params = [] for param in self._params: + found_param = False for block in program.blocks: - if param.name in block.vars: - required_params.append(param) + for op in block.ops: + if param.name in op.input_arg_names or param.name in op.output_arg_names: + required_params.append(param) + found_param = True + break + if found_param: break self._params = required_params + def _get_double_grads(self, program): + double_grads = [] + for block in program.blocks: + for name in block.vars: + if "@GRAD" in name: + var_desc = block.vars[name].desc + var_base = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), False) + double_grads.append(var_base) + return double_grads + def forward(self, inputs): in_vars, out_vars, tmp_scope_vec = self._prepare(inputs) - framework._dygraph_tracer().trace_op( type='run_program', inputs={ 'X': valid_vars(in_vars), 'Params': valid_vars(self._params) }, - outputs={'Out': valid_vars(out_vars), - 'OutScope': tmp_scope_vec}, + outputs={ + 'Out': valid_vars(out_vars), + 'OutScope': tmp_scope_vec, + 'DOut': valid_vars(self._double_grads) + }, attrs={ 'global_block': self.program.desc.block(0), 'start_op_index': 0, diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 33eb16f1b2b44cce8a979f062e2620a0f351e27f..d5ad3a88e8c2416e16d9d01437e5743d7b5d4834 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -166,29 +166,46 @@ def _get_loaded_var_new_old(program_desc, all_new_old_dict_all): def _rename_var_program_desc(program_desc, include=None, exclude=None): """ - Change the name of the loaded variables.Use 'unique_name.generate' to avoid duplication - e.g. linear_0.tmp_3 ==> linear_0.tmp_1, x ==> x_0. - If 'include' is not `None`,variables that are not in include are not renamed. - If 'exclude' is not `None`,variables that are in exclude will are not renamed. + Change the names of the loaded variables. Use 'unique_name.generate' to avoid duplication. + It is used when loading multiple programs during inference. + + e.g. linear_0.tmp_3 ==> linear_0.tmp_1, x ==> x_0. For double grad, x@GRAD ==> x_0@GRAD. + If 'include' is not `None`, variables in include and the corresponding + double grad variables (if they exist) are renamed. + If 'exclude' is not `None`, variables in exclude and the + corresponding double grad variables (if they exist) are not renamed. Args: program_desc(ProgramDesc):the variables in it will be modified. include(List):list of names of variables. exclude(List):list of names of variables. + + Returns: + tuple of (dict_rename_var_new_old, dict_rename_var_old_new): + dict_rename_var_new_old is a dict mapping from new name to old name; + dict_rename_var_old_new is a dict mapping from old name to new name. """ dict_rename_var_old_new = dict() dict_rename_var_new_old = dict() old_names = [] + # Store all old names for b_idx in six.moves.range(program_desc.num_blocks()): cur_block = program_desc.block(b_idx) for var in cur_block.all_vars(): old_names.append(var.name()) + + # Create dict_rename_var_new_old and dict_rename_var_old_new for + # non-double-grad variables + has_double_grad = False for b_idx in six.moves.range(program_desc.num_blocks()): cur_block = program_desc.block(b_idx) for var_idx, var in enumerate(cur_block.all_vars()): name_old = var.name() + is_double_grad_var = "@GRAD" in name_old + has_double_grad = has_double_grad or is_double_grad_var should_rename = (include is None or name_old in include) and ( - exclude is None or name_old not in exclude) + exclude is None or + name_old not in exclude) and not is_double_grad_var if should_rename: temp_name = name_old.split('_') if len(temp_name) > 1 and temp_name[-1].isnumeric(): @@ -206,9 +223,29 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): if name_old != name_new: cur_block._rename_var( cpt.to_bytes(name_old), cpt.to_bytes(name_new)) - dict_rename_var_old_new[name_old] = name_new - dict_rename_var_new_old[name_new] = name_old - + if not is_double_grad_var: + dict_rename_var_old_new[name_old] = name_new + dict_rename_var_new_old[name_new] = name_old + + # Handle double grad names + if has_double_grad: + double_grad_rename_dict = {} + for name_old in dict_rename_var_old_new: + for b_idx in six.moves.range(program_desc.num_blocks()): + cur_block = program_desc.block(b_idx) + for var_idx, var in enumerate(cur_block.all_vars()): + var_name = var.name() + if "@GRAD" in var_name and name_old in var_name: + new_var_name = var_name.replace( + name_old, dict_rename_var_old_new[name_old]) + double_grad_rename_dict[var_name] = new_var_name + for var_name in double_grad_rename_dict: + dict_rename_var_old_new[var_name] = double_grad_rename_dict[ + var_name] + dict_rename_var_new_old[double_grad_rename_dict[ + var_name]] = var_name + + # Rename on program desc for b_idx in six.moves.range(program_desc.num_blocks()): cur_block = program_desc.block(b_idx) for op_idx in six.moves.range(cur_block.op_size()):
op._rename_input( input_arg_name, dict_rename_var_old_new[input_arg_name]) + if cur_block.has_var(cpt.to_bytes(input_arg_name)): + cur_block._rename_var( + cpt.to_bytes(input_arg_name), + cpt.to_bytes(dict_rename_var_old_new[ + input_arg_name])) for output_arg_name in op.output_arg_names(): if output_arg_name in dict_rename_var_old_new: if output_arg_name != dict_rename_var_old_new[ @@ -227,6 +269,11 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): op._rename_output( output_arg_name, dict_rename_var_old_new[output_arg_name]) + if cur_block.has_var(cpt.to_bytes(output_arg_name)): + cur_block._rename_var( + cpt.to_bytes(output_arg_name), + cpt.to_bytes(dict_rename_var_old_new[ + output_arg_name])) program_desc.flush() return dict_rename_var_new_old, dict_rename_var_old_new @@ -267,9 +314,10 @@ class _ProgramHolder(object): def __init__(self, program_desc): super(_ProgramHolder, self).__init__() - # input, output, persistable var info + # input, output, persistable, double_grads var info self._input_descs = [] self._output_descs = [] + self._double_grad_descs = [] self._persistable_names = [] # execution scope @@ -277,7 +325,6 @@ class _ProgramHolder(object): # append suffix var name dict self._suffix_varname_dict = None - # forward program self._infer_program_desc = self._preprocess(program_desc) # forward + backward program @@ -304,6 +351,10 @@ class _ProgramHolder(object): def persistable_names(self): return self._persistable_names + @property + def double_grad_descs(self): + return self._double_grad_descs + @property def scope(self): return self._inner_scope @@ -347,6 +398,12 @@ class _ProgramHolder(object): for op_idx in reversed(ops_to_remove): root_block._remove_op(op_idx, op_idx + 1) + for i in range(program_desc.num_blocks()): + block_desc = program_desc.block(i) + for var_desc in block_desc.all_vars(): + if "@GRAD" in var_desc.name(): + self._double_grad_descs.append(var_desc) + # 2. Input processing, reverse feed vars self._input_descs.reverse() @@ -412,7 +469,6 @@ class _ProgramHolder(object): # rewrite a series of methods for append_backward for program_desc. # Therefore, in order to reuse the method of backward.py, build the program here. program = _build_program_by_desc(program_desc_copy) - # 3. Add the outputs which is only used for training and not saved in # inference program. for block_idx in six.moves.range(program.num_blocks): @@ -738,6 +794,20 @@ def _run_dygraph(instance, input, program_holder): core.VarDesc.VarType.STEP_SCOPES, True) tmp_scope_vec.value().set_scope(program_holder.scope) + double_grad_vars = [] + for var_desc in program_holder.double_grad_descs: + var = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) + double_grad_vars.append(var) + if len(double_grad_vars) == 0: + double_grad_vars = [ + core.VarBase( + value=[1], + name='Fake_var', + place=framework._current_expected_place()) + ] + # 2. 
run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program end_op_index = program_holder.infer_program.block(0).op_size() @@ -745,8 +815,11 @@ type='run_program', inputs={'X': input_vars, 'Params': persistable_vars}, - outputs={'Out': output_vars, - 'OutScope': tmp_scope_vec}, + outputs={ + 'Out': output_vars, + 'OutScope': tmp_scope_vec, + 'DOut': double_grad_vars + }, attrs={ 'global_block': trace_program.block(0), 'start_op_index': 0, diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index e39fc3e23fe5648153df9bc1be270ed750ff6362..a014e0a722ab3280b2738b45de65ef29d189c5bd 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -46,9 +46,7 @@ _supported_promote_complex_types_ = [ '__rsub__', '__mul__', '__rmul__', - '__div__', '__truediv__', - '__rdiv__', '__rtruediv__', '__matmul__', ] @@ -168,9 +166,6 @@ def monkey_patch_math_varbase(): def _scalar_mul_(var, value): return _scalar_elementwise_op_(var, value, 0.0) - def _scalar_div_(var, value): - return _scalar_elementwise_op_(var, 1.0 / value, 0.0) - # for binary operator such as elementwise, compare def _binary_creator_(method_name, op_type, @@ -201,7 +196,10 @@ def monkey_patch_math_varbase(): if op_type == 'elementwise_div' and self.dtype in _supported_int_dtype_: self = astype(self, 'float32') # here use `scale` replace `elementwise` to get better performance - # but only +, -, *, / can use this method + # but only +, -, * can use this method + # NOTE(chentianyu03): / cannot use the `scale` method, because the result of + # the `scale` method (self*(1/other_var)) does not exactly equal the result + # of the `elementwise_div` method. if scalar_method is not None: return scalar_method(self, other_var) else: @@ -288,12 +286,8 @@ def monkey_patch_math_varbase(): ## a*b == b*a. Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), - ('__div__', _binary_creator_('__div__', 'elementwise_div', False, - _scalar_div_)), ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', - False, _scalar_div_)), - ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, - None)), + False, None)), ('__rtruediv__', _binary_creator_('rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index a858ba783428e3d1c8870c87e51e474806c17042..bffeaf2c6c973ec7ff928eae6bac1c3fa2af5f50 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -53,6 +53,7 @@ __all__ = [ 'cuda_pinned_places', 'in_dygraph_mode', 'is_compiled_with_cuda', + 'is_compiled_with_rocm', 'is_compiled_with_xpu', 'Variable', 'require_version', @@ -398,6 +399,21 @@ def is_compiled_with_cuda(): return core.is_compiled_with_cuda() + +def is_compiled_with_rocm(): + """ + Whether this whl package can be used to run the model on AMD or Hygon GPU (ROCm). + + Returns (bool): `True` if ROCm is currently available, otherwise `False`. + + Examples: + ..
code-block:: python + + import paddle + support_gpu = paddle.is_compiled_with_rocm() + """ + return core.is_compiled_with_rocm() + + def cuda_places(device_ids=None): """ **Note**: diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index d4af3e2f8042a5dd83e5d4ba06cb3c89b352f8ed..89b2a8237dc65ab8ebd6b145c878e9da5501946d 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -365,7 +365,41 @@ def ps_gpu_pass(program): for name in remove_var: program.global_block()._remove_var(name) + def _remove_optimizer_var(program): + + embedding_w = {} + for idx, op in list(enumerate(program.global_block().ops)): + if op.type == "lookup_table_grad": + for name in op.input("W"): + embedding_w[name] = 1 + + optimize_vars = [] + optimize_op_role_vars = [] + optimize_need_delete_vars = [] + for op in _get_optimize_ops(program): + for name in op.input("Param"): + if name in embedding_w: + optimize_op_role_vars.extend(op.attr("op_role_var")) + for key_name in op.input_names: + if key_name == "LearningRate": + continue + for var in op.input(key_name): + optimize_vars.append(var) + + optimize_vars = list(set(optimize_vars)) + optimize_op_role_vars = list(set(optimize_op_role_vars)) + + for var in optimize_vars: + if var not in optimize_op_role_vars: + optimize_need_delete_vars.append(var) + need_delete_optimize_vars = list(set(optimize_need_delete_vars)) + + for name in need_delete_optimize_vars: + if program.global_block().has_var(name): + program.global_block()._remove_var(name) + _add_push_box_sparse_op(program) + _remove_optimizer_var(program) _remove_lookup_table_grad_op_and_var(program) return program diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index e9738b6660eeaf935fbdfac8ce1b8921df2c7b02..c2de5670eb42c18621e1df815caae0b23f4e46f3 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -312,6 +312,10 @@ class LayerHelperBase(object): if not attr: return None assert isinstance(attr, ParamAttr) + for i, size in enumerate(shape): + assert size > 0, ( + "Expected every dim's size to be larger than 0, " + "but the size of the {}-th dim is {}".format(i, size)) # set global dtype if not dtype: dtype = self.__dtype diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index a2dee91dbef7c04340585cdf602afb48d04bc468..2a57c1a907aaccf2f1a511fb11b617cc11143606 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -39,9 +39,7 @@ EXPRESSION_MAP = { "__rsub__": "A -= B", "__mul__": "A * B", "__rmul__": "A *= B", - "__div__": "A / B", "__truediv__": "A / B", - "__rdiv__": "A /= B", "__rtruediv__": "A /= B", "__pow__": "A ** B", "__rpow__": "A **= B", @@ -209,9 +207,6 @@ def monkey_patch_variable(): def _scalar_mul_(var, value): return _scalar_op_(var, value, 0.0) - def _scalar_div_(var, value): - return _scalar_op_(var, 1.0 / value, 0.0) - def _binary_creator_(method_name, op_type, reverse=False, @@ -241,7 +236,10 @@ def monkey_patch_variable(): if op_type == 'elementwise_div' and self.dtype in _supported_int_dtype_: self = astype(self, 'float32') # here use `scale` replace `elementwise` to get better performance - # but only +, -, *, / can use this method + # but only +, -, * can use this method + # 
NOTE(chentianyu03): / cannot use the `scale` method, because the result of + # the `scale` method (self*(1/other_var)) does not exactly equal the result + # of the `elementwise_div` method. if scalar_method is not None: return scalar_method(self, other_var) else: @@ -337,12 +335,8 @@ def monkey_patch_variable(): # a*b == b*a. Do not need to reverse explicitly ('__rmul__', _binary_creator_('__rmul__', 'elementwise_mul', False, _scalar_mul_)), - ('__div__', _binary_creator_('__div__', 'elementwise_div', False, - _scalar_div_)), ('__truediv__', _binary_creator_('__truediv__', 'elementwise_div', - False, _scalar_div_)), - ('__rdiv__', _binary_creator_('__rdiv__', 'elementwise_div', True, - None)), + False, None)), ('__rtruediv__', _binary_creator_('__rtruediv__', 'elementwise_div', True, None)), ('__pow__', _binary_creator_('__pow__', 'elementwise_pow', False, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f87485c6a8f220f13bf3391de48a68a43b700cc5..ee08cb8654ec135a4efde03704ee0911d0fe18e1 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1502,6 +1502,9 @@ def conv2d(input, check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], 'conv2d') + if len(input.shape) != 4: + raise ValueError("Input should be a 4-D tensor, " + "but received a {}-D tensor".format(len(input.shape))) num_channels = input.shape[1] if not isinstance(use_cudnn, bool): raise ValueError("Attr(use_cudnn) should be True or False. Received " @@ -1520,6 +1523,20 @@ def conv2d(input, "Received: %s." % (str(input.shape), str(num_channels))) assert param_attr is not False, "param_attr should not be False here." + if groups is None: + num_filter_channels = num_channels + elif groups <= 0: + raise ValueError("groups must be greater than 0, " + "but received groups={}".format( + groups)) + else: + if num_channels % groups != 0: + raise ValueError( + "the channel of input must be divisible by groups, " + "received: the channel of input is {}, the shape of input is {}" + ", the groups is {}".format(num_channels, input.shape, groups)) + num_filter_channels = num_channels // groups + l_type = 'conv2d' if (num_channels == groups and num_filters % num_channels == 0 and not use_cudnn): @@ -1532,16 +1549,6 @@ def conv2d(input, helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() - if groups is None: - num_filter_channels = num_channels - else: - if num_channels % groups != 0: - raise ValueError( - "the channel of input must be divisible by groups," - "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, input.shape, groups)) - num_filter_channels = num_channels // groups - filter_size = utils.convert_to_list(filter_size, 2, 'filter_size') stride = utils.convert_to_list(stride, 2, 'stride') dilation = utils.convert_to_list(dilation, 2, 'dilation') @@ -1597,6 +1604,11 @@ def conv2d(input, def _get_default_param_initializer(): filter_elem_num = filter_size[0] * filter_size[1] * num_channels + if filter_elem_num <= 0: + raise ValueError( + "Invalid filter number, expected a number larger than 0, but" + " received {}, please check the input shape and " + "filter size.".format(filter_elem_num)) std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0)
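A quick numeric check of the groups logic introduced above (the values are illustrative):

    num_channels, groups, num_filters = 6, 3, 12
    assert num_channels % groups == 0               # otherwise conv2d raises ValueError
    num_filter_channels = num_channels // groups    # each filter sees 2 input channels
    # depthwise special case from the l_type selection above: groups == num_channels
    # and num_filters % num_channels == 0 (with use_cudnn=False) switches l_type
    # from 'conv2d' to 'depthwise_conv2d'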
@@ -1878,6 +1890,12 @@ def conv3d(input, def _get_default_param_initializer(): filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ 2] * num_channels + if filter_elem_num <= 0: + raise ValueError( + "Invalid filter number, expected a number larger than 0, but" + " received {}, please check the input shape and " + "filter size.".format(filter_elem_num)) + std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0) @@ -14412,6 +14430,11 @@ def deformable_conv(input, def _get_default_param_initializer(): filter_elem_num = filter_size[0] * filter_size[1] * num_channels + if filter_elem_num <= 0: + raise ValueError( + "Invalid filter number, expected a number larger than 0, but" + " received {}, please check the input shape and " + "filter size.".format(filter_elem_num)) std = (2.0 / filter_elem_num)**0.5 return Normal(0.0, std, 0) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 987918493d3b4aa1d313403cbb1aa6ffc8c8e6e9..a62217c628c302b7bb9e3f4aa62ee6fad17cb6bc 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -587,7 +587,8 @@ def assign(input, output=None): # after this api. if isinstance(input, (Variable, core.VarBase)): check_dtype(input.dtype, 'input', [ - 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', 'bool' + 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', + 'uint8', 'bool' ], 'assign', '(When the type of input in assign is Variable.)') if output is None: output = helper.create_variable_for_type_inference(
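The next hunk adds multi-precision (fp32 master weight) support and a rescale_grad option to LarsMomentumOptimizer. A standalone NumPy sketch of why the master copy matters, illustrative only and not Paddle API:

    import numpy as np
    lr, grad = 0.01, np.float32(1e-4)
    p16 = np.float16(0.1)         # fp16 parameter used in forward/backward
    master = np.float32(p16)      # fp32 master copy held by the optimizer
    for _ in range(100):
        master -= lr * grad       # tiny updates accumulate in fp32
        p16 = np.float16(master)  # cast back for the next fp16 step
    # updating p16 directly would round each 1e-6 step away, since the fp16
    # spacing near 0.1 is about 6e-5; via the master weight the accumulated
    # 1e-4 change survives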
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index c0b93c83f78e125cf628031580550036fdc2692e..60d25a77c58dc05c9b52e35c617f95bc4647d277 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -1725,6 +1725,9 @@ class LarsMomentumOptimizer(Optimizer): For details, please refer to :ref:`api_guide_Name`. Default is None. exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None. epsilon (float, optional): Epsilon to avoid Division by Zero when calculate local lr. Default is 0. + multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False. + rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \ + before updating. Often chosen to be `1.0/batch_size`. Default is 1.0. Examples: .. code-block:: python @@ -1758,7 +1761,9 @@ class LarsMomentumOptimizer(Optimizer): grad_clip=None, name=None, exclude_from_weight_decay=None, - epsilon=0): + epsilon=0, + multi_precision=False, + rescale_grad=1.0): assert learning_rate is not None assert momentum is not None super(LarsMomentumOptimizer, self).__init__( @@ -1776,16 +1781,70 @@ class LarsMomentumOptimizer(Optimizer): self._exclude_from_weight_decay = [] else: self._exclude_from_weight_decay = exclude_from_weight_decay + self._multi_precision = multi_precision + self._rescale_grad = float(rescale_grad) + self._master_weights = {} + + def _create_master_weight(self, param): + assert isinstance(self.helper, LayerHelper) + + var_name = param.name + '_fp32_master' + var_name = unique_name.generate(var_name) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True) + block = self.helper.startup_program.global_block() + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32 + }) + self._master_weights[param.name] = var + return var + + def _get_accumulator(self, name, param): + """Utility function to fetch an accumulator for a parameter + Args: + name: name of the accumulator + param: parameter variable for which the accumulator is to be fetched + Returns: + accumulator variable for the parameter + """ + if self._name is not None: + name = self._name + "_" + name + find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + target_param = self._master_weights[ + param.name] if find_master else param + target_name = target_param.name + if (name not in self._accumulators or + target_name not in self._accumulators[name]): + raise Exception("Accumulator {} does not exist for parameter {}". + format(name, target_name)) + return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) for p in parameters: + if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16: + master_p = self._create_master_weight(p) + self._add_accumulator(self._velocity_acc_str, master_p) + continue + if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + warnings.warn( + "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence. " + "Consider using the multi_precision=True option of the Lars optimizer."
+ ) self._add_accumulator(self._velocity_acc_str, p) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) - _lars_weight_decay = self._lars_weight_decay param_name = param_and_grad[0].name if len(self._exclude_from_weight_decay) > 0: @@ -1796,25 +1855,40 @@ class LarsMomentumOptimizer(Optimizer): velocity_acc = self._get_accumulator(self._velocity_acc_str, param_and_grad[0]) + lr = self._create_param_lr(param_and_grad) + + find_master = self._multi_precision and param_and_grad[ + 0].dtype == core.VarDesc.VarType.FP16 + master_weight = (self._master_weights[param_and_grad[0].name] + if find_master else None) + + attrs = { + "mu": self._momentum, + "lars_coeff": self._lars_coeff, + "lars_weight_decay": _lars_weight_decay, + "multi_precision": find_master, + "rescale_grad": self._rescale_grad + } + + inputs = { + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Velocity": velocity_acc, + "LearningRate": lr + } + + outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc} + + if find_master: + inputs["MasterParam"] = master_weight + outputs["MasterParamOut"] = master_weight + # create the momentum optimize op momentum_op = block.append_op( type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "Velocity": velocity_acc, - "LearningRate": self._create_param_lr(param_and_grad) - }, - outputs={ - "ParamOut": param_and_grad[0], - "VelocityOut": velocity_acc - }, - attrs={ - "mu": self._momentum, - "lars_coeff": self._lars_coeff, - "lars_weight_decay": _lars_weight_decay, - "epsilon": self._epsilon - }, + inputs=inputs, + outputs=outputs, + attrs=attrs, stop_gradient=True) return momentum_op diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c4a256f0e193d750516baffb6184e273ce1ba246..85fbe001970ba7179691aa2853e53922f33944a1 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -80,6 +80,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_c_split) LIST(REMOVE_ITEM TEST_OPS test_allgather) LIST(REMOVE_ITEM TEST_OPS test_c_identity) + LIST(REMOVE_ITEM TEST_OPS test_c_embedding_op) LIST(REMOVE_ITEM TEST_OPS test_allreduce) LIST(REMOVE_ITEM TEST_OPS test_broadcast) LIST(REMOVE_ITEM TEST_OPS test_collective_reduce) @@ -477,6 +478,8 @@ py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_stat py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS FLAGS_cudnn_deterministic=1) set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +set_tests_properties(test_conv2d_api PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) # FIXME(typhoonzero): add these tests back list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") @@ -574,7 +577,7 @@ endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf) # Coverage pipeline use cuda 10.1 now, profiler will random hang in cuda 10.1, # see https://github.com/PaddlePaddle/Paddle/issues/29082 for details. -# We guess there are some bugs in cuda 10.1 or 10.2, +# We guess there are some bugs in cuda 10.1 or 10.2, # since this unittest is stable in cuda 11 (py3 pipeline) now. 
if(NOT WITH_COVERAGE) py_test_modules(test_parallel_executor_profiler MODULES test_parallel_executor_profiler) @@ -599,8 +602,8 @@ py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_c py_test_modules(test_fuse_bn_add_act_pass MODULES test_fuse_bn_add_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000) # NOTE: These unittests will appear NaN steadily in windows CI. After analysis, -# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, -# which will not appear in other CIs. The calculation behavior of some ops in inference mode is +# it is found that windows CI will run all the training unittests with the ON_INFER option turned on, +# which will not appear in other CIs. The calculation behavior of some ops in inference mode is # inconsistent with that in non-inference mode. if(NOT ON_INFER) py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu) @@ -643,7 +646,7 @@ if (WITH_XPU) add_subdirectory(xpu) endif() -# dist xpu tests: +# dist xpu tests: if (WITH_XPU_BKCL) py_test(test_collective_reduce_api_xpu SRCS "test_collective_reduce_api.py") py_test(test_collective_allreduce_api_xpu SRCS "test_collective_allreduce_api.py") @@ -711,6 +714,7 @@ if (WITH_DISTRIBUTE) set_tests_properties(test_dist_fleet_ctr2 PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_fleet_infer PROPERTIES TIMEOUT 200) + set_tests_properties(test_dist_fleet_raw_program_optimizer PROPERTIES TIMEOUT 120) endif() if (WITH_DISTRIBUTE AND NOT APPLE) @@ -838,6 +842,8 @@ set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120) set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_conv2d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv2d_op_depthwise_conv PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv2d_api PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120) set_tests_properties(test_fuse_all_reduce_pass PROPERTIES TIMEOUT 120) @@ -845,7 +851,7 @@ set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) -set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 270) set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py index 3ab93b38795865225479ea3bae1fb7b8be591194..2a8ee8bc72172565840867510f08bdcdfa1509a5 100644 --- a/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_fleet_ctr.py @@ -230,6 +230,10 @@ class TestDistCTR2x2(FleetDistRunnerBase): except fluid.core.EOFException: self.reader.reset() + dirname = os.getenv("SAVE_DIRNAME", None) + if dirname: + fleet.save_persistables(exe, 
dirname=dirname) + model_dir = tempfile.mkdtemp() fleet.save_inference_model( exe, model_dir, [feed.name for feed in self.feeds], self.avg_cost) @@ -279,5 +283,9 @@ class TestDistCTR2x2(FleetDistRunnerBase): self.check_model_right(model_dir) shutil.rmtree(model_dir) + dirname = os.getenv("SAVE_DIRNAME", None) + if dirname: + fleet.save_persistables(exe, dirname=dirname) + if __name__ == "__main__": runtime_main(TestDistCTR2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..575c07390a35bbef00694a1e1c40bc0598e741ab --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_fleet_raw_program_optimizer.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from test_dist_base import TestDistRunnerBase, runtime_main +import unittest +import paddle +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker +import numpy as np +from functools import reduce +import paddle.fluid as fluid + +paddle.enable_static() + +DTYPE = "float32" +paddle.dataset.mnist.fetch() + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def cnn_model(data): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=data, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu", + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + SIZE = 10 + input_shape = conv_pool_2.shape + param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 + + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + return predict + + +class TestFleetMetaOptimizerPrecision(TestDistRunnerBase): + def get_model(self, batch_size=2, single_device=False): + # Input data + images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype=DTYPE) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + # Train program + predict = cnn_model(images) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + + # Evaluator + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size_tensor) + + test_program = fluid.default_main_program().clone(for_test=True) + + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + 
test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + + optimizer = paddle.fluid.optimizer.Adam(0.01) + if single_device: + optimizer.minimize(avg_cost) + else: + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.without_graph_optimization = True + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + return test_program, avg_cost, train_reader, test_reader, batch_acc, predict + + +if __name__ == "__main__": + runtime_main(TestFleetMetaOptimizerPrecision) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..ab87beb9e1017200348753f2f31c52dcced405d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_grad.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import paddle +import unittest + + +class GradLayer(paddle.nn.Layer): + def __init__(self): + super(GradLayer, self).__init__() + + @paddle.jit.to_static + def forward(self, x): + x.stop_gradient = False + y = x * x + dx = paddle.grad(outputs=[y], inputs=[x])[0] + return dx + + +class GradLinearLayer(paddle.nn.Layer): + def __init__(self): + super(GradLinearLayer, self).__init__() + self.linear = paddle.nn.Linear(5, 5, bias_attr=False) + + @paddle.jit.to_static + def forward(self, x): + x.stop_gradient = False + tmp = x + x + for i in range(10): + tmp = self.linear(tmp) + out = tmp + dx = paddle.grad( + [out], [x], None, create_graph=True, allow_unused=False)[0] + return dx + + +class TestGrad(unittest.TestCase): + def setUp(self): + self.func = GradLayer() + self.x = paddle.ones(shape=[10, 2, 5], dtype='float32') + self.x.stop_gradient = False + + def _run(self, func, to_static): + prog_trans = paddle.jit.ProgramTranslator() + prog_trans.enable(to_static) + ret = func(self.x).numpy() + prog_trans.enable(True) + return ret + + def test_forward(self): + dygraph_res = self._run(self.func, to_static=False) + static_res = self._run(self.func, to_static=True) + self.assertTrue(np.allclose(static_res, dygraph_res)) + + +class TestGradLinear(TestGrad): + def setUp(self): + self.func = GradLinearLayer() + self.x = paddle.ones(shape=[10, 2, 5], dtype='float32') + self.x.stop_gradient = False + + def test_save_infer_program(self): + path = "double_grad_infer_model" + input_spec = [ + paddle.static.InputSpec( + shape=[10, 2, 5], dtype='float32') + ] + paddle.jit.save(self.func, path, input_spec=input_spec) + load_func = paddle.jit.load(path) + + origin_res = self.func(self.x).numpy() + load_res = load_func(self.x).numpy() + self.assertTrue(np.allclose(origin_res, load_res)) + + def test_save_train_program(self): + grad_clip = 
paddle.nn.ClipGradByGlobalNorm(2.0) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + grad_clip=grad_clip, + parameters=self.func.parameters()) + for i in range(10): + out = self.func(self.x) + avg_loss = paddle.mean(paddle.abs(out - 1)) + avg_loss.backward() + optimizer.minimize(avg_loss) + self.func.clear_gradients() + + path = "double_grad_train_model" + paddle.jit.save(self.func, path) + load_func = paddle.jit.load(path) + + origin_res = self.func(self.x).numpy() + load_res = load_func(self.x).numpy() + self.assertTrue(np.allclose(origin_res, load_res)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py index 349d5f82dbf545c321944c88dfce7d688c78ec89..e69cf7d267bccb4e302e904158c6f04b4287f9f0 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_layers.py @@ -212,7 +212,7 @@ class TestDistTraning(unittest.TestCase): optimizer_b.step() np.testing.assert_allclose( - loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + loss_a.numpy(), loss_b.numpy(), rtol=5e-6) def test_parallel_embedding(self): batch_size = 17 @@ -265,8 +265,9 @@ class TestDistTraning(unittest.TestCase): optimizer_a.step() optimizer_b.step() - np.testing.assert_allclose( - loss_a.numpy(), loss_b.numpy(), rtol=1e-6) + print(loss_a.numpy(), loss_b.numpy()) + + np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy()) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py index a9f251f3079cef2860c2599f6a8d33abf8da5fb8..f9ec49d88172a6476f1ba6c8620d621313673267 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_mp_model.py @@ -32,14 +32,36 @@ def set_random_seed(seed, dp_id, rank_id): paddle.seed(seed + rank_id) -vocab_size = 5 +vocab_size = 20 hidden_size = 10 inner_size = 8 -output_size = 2 +output_size = 10 seq_length = 2 batch_size = 4 +def parallel_matmul(lm_output, logit_weights, parallel_output): + hcg = fleet.get_hybrid_communicate_group() + model_parallel_group = hcg.get_model_parallel_group() + world_size = hcg.get_model_parallel_world_size() + rank = hcg.get_model_parallel_rank() + + if world_size > 1: + input_parallel = paddle.distributed.collective._c_identity( + lm_output, group=model_parallel_group) + + logits = paddle.matmul(input_parallel, logit_weights, transpose_y=True) + + if parallel_output: + return logits + + return paddle.distributed.collective._c_concat( + logits, group=model_parallel_group) + else: + logits = paddle.matmul(lm_output, logit_weights, transpose_y=True) + return logits + + class SimpleMPNet(fluid.dygraph.Layer): def __init__(self, vocab_size, hidden_size, inner_size, output_size, np_fc1, np_fc2, mp_id): @@ -86,6 +108,7 @@ class SimpleMPNet(fluid.dygraph.Layer): x = self.linear1(x) x = self.linear2(x) x = self.linear3(x) + x = parallel_matmul(x, self.embedding.weight, False) return x @@ -128,6 +151,7 @@ class SimpleDPNet(fluid.dygraph.Layer): x = self.linear1(x) x = self.linear2(x) x = self.linear3(x) + x = paddle.matmul(x, self.embedding.weight, transpose_y=True) return x @@ -192,7 +216,7 @@ class TestDistMPTraning(unittest.TestCase): loss_b = self.train_batch(batch, model_b, optimizer_b, False) np.testing.assert_allclose( - loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + 
loss_a.numpy(), loss_b.numpy(), rtol=1e-6) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 8e4c091cd01dd3a7ee72957e3e6e3a7661ac8b19..0f068045e0c09c672d3bbc0a0ae0d04ff8d3223a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -8,6 +8,7 @@ foreach(TEST_INFERENCE_IR_PASS ${TEST_TRT_IR_PASSES}) endforeach() if(WITH_GPU AND TENSORRT_FOUND) + list(REMOVE_ITEM TEST_TRT_IR_PASSES test_trt_multiclass_nms_op) foreach(target ${TEST_TRT_IR_PASSES}) py_test_modules(${target} MODULES ${target}) endforeach() @@ -32,6 +33,6 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120) -set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200) +#set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200) set_tests_properties(test_trt_dynamic_shape PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py index ec3955a9ae1441cdaa4efa5b0e87ff8b74a0b689..ebbf724d0b4eadb3b1a2b81d71e7126b2ecd3f4d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_conv_pass.py @@ -36,6 +36,7 @@ class TensorRTSubgraphPassConvTest(InferencePassTest): groups=self.conv_groups, padding=self.conv_padding, bias_attr=False, + use_cudnn=self.use_cudnn, act=None) self.feeds = { "data": np.random.random([1, 6, 64, 64]).astype("float32"), @@ -50,6 +51,7 @@ class TensorRTSubgraphPassConvTest(InferencePassTest): self.conv_filter_size = 6 self.conv_groups = 3 self.conv_padding = [1, 1] + self.use_cudnn = True def test_check_output(self): if core.is_compiled_with_cuda(): @@ -65,6 +67,7 @@ class TensorRTSubgraphPassConvValidPaddingTest(TensorRTSubgraphPassConvTest): self.conv_filter_size = 6 self.conv_groups = 3 self.conv_padding = 'VALID' + self.use_cudnn = True class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest): @@ -73,6 +76,7 @@ class TensorRTSubgraphPassConvSamePaddingTest(InferencePassTest): self.conv_filter_size = 6 self.conv_groups = 3 self.conv_padding = 'SAME' + self.use_cudnn = True class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest): @@ -81,6 +85,16 @@ class TensorRTSubgraphPassDepthwiseConvTest(TensorRTSubgraphPassConvTest): self.conv_filter_size = 6 self.conv_groups = 6 self.conv_padding = [1, 1] + self.use_cudnn = False + + +class TensorRTSubgraphPassDepthwiseConv2Test(TensorRTSubgraphPassConvTest): + def set_params(self): + self.conv_num_filters = 12 + self.conv_filter_size = 6 + self.conv_groups = 6 + self.conv_padding = [1, 1] + self.use_cudnn = False class TensorRTSubgraphPassConvTransposeTest(InferencePassTest): @@ -151,6 +165,16 @@ class TensorRTSubgraphPassConvTransposeMultiGroupTest( self.use_cudnn = True +class TensorRTSubgraphPassConvTranspose2Test( + TensorRTSubgraphPassConvTransposeTest): + def set_params(self): + self.conv_num_filters = 12 + self.conv_filter_size = 4 + self.conv_groups = 6 + self.conv_padding = [1, 1] + self.use_cudnn = False + + class TensorRTSubgraphPassDepthwiseConvTransposeTest( TensorRTSubgraphPassConvTransposeTest): 
def set_params(self): @@ -161,5 +185,70 @@ class TensorRTSubgraphPassDepthwiseConvTransposeTest( self.use_cudnn = False +class DynamicShapeTensorRTSubgraphPassConvTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, -1, -1], dtype="float32") + conv_out = fluid.layers.conv2d( + input=data, + num_filters=self.conv_num_filters, + filter_size=self.conv_filter_size, + groups=self.conv_groups, + padding=self.conv_padding, + bias_attr=False, + use_cudnn=self.use_cudnn, + stride=self.stride, + act=None) + self.feeds = { + "data": np.random.random([32, 6, 64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = DynamicShapeTensorRTSubgraphPassConvTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = DynamicShapeTensorRTSubgraphPassConvTest.DynamicShapeParam( + { + "conv2d_0.tmp_0": [1, 6, 8, 8], + "data": [1, 6, 8, 8], + "depthwise_conv2d_0.tmp_0": [1, 6, 8, 8] + }, { + "conv2d_0.tmp_0": [32, 6, 64, 64], + "data": [32, 6, 64, 64], + "depthwise_conv2d_0.tmp_0": [32, 6, 64, 64] + }, { + "conv2d_0.tmp_0": [16, 6, 16, 16], + "data": [16, 6, 16, 16], + "depthwise_conv2d_0.tmp_0": [16, 6, 16, 16] + }, False) + self.fetch_list = [conv_out] + + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 6 + self.conv_padding = 'SAME' + self.use_cudnn = True + self.stride = [2, 2] + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class DynamicShapeTensorRTSubgraphPassDepthwiseConvTransposeTest( + DynamicShapeTensorRTSubgraphPassConvTest): + def set_params(self): + self.conv_num_filters = 6 + self.conv_filter_size = 6 + self.conv_groups = 6 + self.conv_padding = 'SAME' + self.use_cudnn = False + self.stride = [2, 2] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py new file mode 100644 index 0000000000000000000000000000000000000000..75f5328ac1c419015b379b08cc050f478a5b2112 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_nd_op.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
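Before the new TensorRT gather_nd tests below, a small NumPy sketch of the gather_nd indexing they exercise (assuming standard gather_nd semantics, where each innermost index vector addresses the leading dimensions of the input):

    import numpy as np
    data = np.arange(24).reshape(2, 3, 4)                   # shape [2, 3, 4]
    index = np.array([[[0, 1], [1, 0]], [[1, 2], [0, 1]]])  # shape [2, 2, 2]
    # out[i, j, :] = data[index[i, j, 0], index[i, j, 1], :]
    out = data[index[..., 0], index[..., 1]]                # shape [2, 2, 4]
    assert out.shape == (2, 2, 4)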
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTGatherNdTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[-1, 3, 4], dtype="float32") + index = fluid.data(name="index", shape=[-1, 2, 2], dtype="int32") + gather_nd = fluid.layers.gather_nd(data, index) + out = fluid.layers.batch_norm(gather_nd, is_test=True) + + self.feeds = { + "data": np.random.random([2, 3, 4]).astype("float32"), + "index": + np.array([[[0, 1], [1, 0]], [[1, 2], [0, 1]]]).astype("int32"), + } + self.enable_trt = True + self.trt_parameters = TRTGatherNdTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + self.dynamic_shape_params = TRTGatherNdTest.DynamicShapeParam({ + 'data': [1, 3, 4], + 'index': [1, 2, 2] + }, {'data': [3, 3, 4], + 'index': [3, 2, 2]}, {'data': [3, 3, 4], + 'index': [3, 2, 2]}, False) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTGatherNdFp16Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 5120, 768], dtype="float32") + index = fluid.data(name="index", shape=[-1, 4096, 2], dtype="int32") + gather_nd = fluid.layers.gather_nd(data, index) + out = fluid.layers.batch_norm(gather_nd, is_test=True) + + index_data = np.zeros((1, 4096, 2), dtype='int32') + self.feeds = { + "data": np.random.random([1, 5120, 768]).astype("float32"), + "index": index_data, + } + self.enable_trt = True + self.trt_parameters = TRTGatherNdFp16Test.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Half, False, False) + self.fetch_list = [out] + self.dynamic_shape_params = TRTGatherNdFp16Test.DynamicShapeParam({ + 'data': [1, 5120, 768], + 'index': [1, 4096, 2] + }, {'data': [3, 5120, 768], + 'index': + [3, 4096, 2]}, {'data': [3, 5120, 768], + 'index': [3, 4096, 2]}, False) + + def test_check_output(self, atol=1e-3): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py new file mode 100644 index 0000000000000000000000000000000000000000..bb5e8e99b092690a2c88d75ba3a77de2dba2e720 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reduce_sum_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTReduceSumTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 10, 768], dtype="float32") + reduce_sum = fluid.layers.reduce_sum( + data, dim=[2, -1], keep_dim=True) + out = fluid.layers.batch_norm(reduce_sum, is_test=True) + + self.feeds = { + "data": np.random.random([3, 3, 10, 768]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TRTReduceSumTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + self.dynamic_shape_params = TRTReduceSumTest.DynamicShapeParam({ + 'data': [1, 3, 8, 8] + }, {'data': [3, 3, 10, 768]}, {'data': [3, 3, 10, 768]}, False) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTReduceSumAllTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 10, 768], dtype="float32") + reduce_sum = fluid.layers.reduce_sum(data, keep_dim=True) + out = fluid.layers.batch_norm(reduce_sum, is_test=True) + + self.feeds = { + "data": np.random.random([3, 3, 10, 768]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TRTReduceSumAllTest.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + self.dynamic_shape_params = TRTReduceSumAllTest.DynamicShapeParam({ + 'data': [1, 3, 8, 8] + }, {'data': [3, 3, 10, 768]}, {'data': [3, 3, 10, 768]}, False) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py new file mode 100644 index 0000000000000000000000000000000000000000..90a6c482cdbbacdbbdb53a3bdca626b685f7a77f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -0,0 +1,109 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTReshapeTest(InferencePassTest): + def setUp(self): + self.bs = 1 + self.input_shape = [32, 15, 24] + self.reshape = [-1, 8, 20, 72] + self.data_shape = [ + self.bs, self.input_shape[0], self.input_shape[1], + self.input_shape[2] + ] + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', shape=self.data_shape, dtype='float32') + reshape_out = self.append_reshape(data, self.reshape) + out = fluid.layers.batch_norm(reshape_out, is_test=True) + self.feeds = { + 'data': np.random.random(self.data_shape).astype('float32'), + } + self.enable_trt = True + self.trt_parameters = TRTReshapeTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def append_reshape(self, data, reshape): + return fluid.layers.reshape(data, reshape) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTReshapeTest1(TRTReshapeTest): + def setUp(self): + self.bs = 2 + self.input_shape = [23, 13, 24] + self.reshape = [2, 0, -1, 12] + self.data_shape = [ + self.bs, self.input_shape[0], self.input_shape[1], + self.input_shape[2] + ] + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', shape=self.data_shape, dtype='float32') + reshape_out = self.append_reshape(data, self.reshape) + out = fluid.layers.batch_norm(reshape_out, is_test=True) + self.feeds = { + 'data': np.random.random(self.data_shape).astype('float32'), + } + self.enable_trt = True + self.trt_parameters = TRTReshapeTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + +class TRTReshapeTest2(TRTReshapeTest): + def setUp(self): + self.bs = 1 + self.input_shape = [14, 48, 27] + self.reshape = [1, 24, 28, 0] + self.data_shape = [ + self.bs, self.input_shape[0], self.input_shape[1], + self.input_shape[2] + ] + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', shape=self.data_shape, dtype='float32') + bn_out = fluid.layers.batch_norm(data, is_test=True) + out = self.append_reshape(bn_out, self.reshape) + self.feeds = { + 'data': np.random.random(self.data_shape).astype('float32'), + } + self.enable_trt = True + self.trt_parameters = TRTReshapeTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.dynamic_shape_params = TRTReshapeTest.DynamicShapeParam({ + 'data': [1, 3, 8, 8] + }, {'data': [5, 100, 100, 100]}, {'data': [1, 3, 16, 16]}, False) + self.fetch_list = [out] + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py index 46bdbb1a420af2aeae3db430ed08f210e742bc26..d65919aa434c387348963e0a0ef00712ca91d549 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_bf16_mkldnn_op.py @@ -32,7 +32,8 @@ class TestFusionLSTMBF16ONEDNNOp(OpTest): def test_check_output(self): for use_seq in {True, False}: self.attrs['use_seq'] = use_seq - self.check_output(check_dygraph=False, no_check_set=["Cell"]) + self.check_output( + check_dygraph=False, no_check_set=["Cell"], atol=2e-2) def setUp(self): self.op_type = 'fusion_lstm' diff --git a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py index a3b4242f39d36631b546e775693cc2187c64b14b..8d3a9baa787a03c5f9069ab9802542f29091e4e7 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_adam_op_npu.py @@ -251,8 +251,8 @@ class TestNet(unittest.TestCase): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-4)) + self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) @unittest.skipIf(not paddle.is_compiled_with_npu(), @@ -335,8 +335,8 @@ class TestNetWithEpsilonTensor(unittest.TestCase): cpu_pred, cpu_loss = self._test(False) npu_pred, npu_loss = self._test(True) - self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-4)) + self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-3)) + self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-3)) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 654723d86299006b83d8f27d6090fb3e4c1082e2..9bf4d09cc36c350958f69b9cff25b5b4e95d21a1 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1191,7 +1191,9 @@ class OpTest(unittest.TestCase): np.float32, np.float64 ]: actual_t = convert_uint16_to_float(actual_t) - atol = max(atol, 0.03) + rtol = 1.e-2 + else: + rtol = 1.e-5 if expect_t.dtype == np.uint16 and actual_t.dtype == np.uint16: expect_t = convert_uint16_to_float(expect_t) @@ -1204,7 +1206,11 @@ class OpTest(unittest.TestCase): self.assertTrue( np.allclose( - actual_t, expect_t, atol=atol, equal_nan=equal_nan), + actual_t, + expect_t, + rtol=rtol, + atol=atol, + equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) diff --git a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py index 6c35d445b43b7beec5d4d58d29adecffc9a0325c..81b3e9bf34887e07b7472aa516c1da90242002d8 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_activation_nn_grad.py @@ -26,6 +26,28 @@ import gradient_checker from decorator_helper import prog_scope +class TestSigmoidDoubleGradCheck(unittest.TestCase): + @prog_scope() + def func(self, place): + shape = [2, 3, 7, 9] + eps = 
0.0005 + dtype = np.float64 + x = layers.data('x', shape, False, dtype=dtype) + x.persistable = True + y = layers.sigmoid(x) + x_arr = np.random.uniform(-1, 1, shape).astype(dtype) + x_arr[np.abs(x_arr) < 0.005] = 0.002 + gradient_checker.double_grad_check( + [x], y, x_init=x_arr, place=place, eps=eps) + + def test_grad(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + class TestTanhDoubleGradCheck(unittest.TestCase): @prog_scope() def func(self, place): diff --git a/python/paddle/fluid/tests/unittests/test_adadelta_op.py b/python/paddle/fluid/tests/unittests/test_adadelta_op.py index 2c6c018b9dfac13d97c242e1f36adbddf9dbf3f1..44dd3d60bdca1af0c81373dae60689cd579d35ec 100644 --- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py +++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py @@ -127,6 +127,7 @@ class TestAdadeltaV2(unittest.TestCase): adam.clear_gradients() def test_adadelta(self): + paddle.enable_static() place = fluid.CPUPlace() main = fluid.Program() with fluid.program_guard(main): @@ -159,5 +160,29 @@ class TestAdadeltaV2(unittest.TestCase): epsilon=None) +class TestAdadeltaV2Group(TestAdadeltaV2): + def test_adadelta_dygraph(self): + paddle.disable_static(paddle.CPUPlace()) + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 5) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adadelta( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + }], + weight_decay=0.1) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py b/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py index 0ccd42aa674dd410bdd2ea34a27929bede345332..c6a69c0723ce9142980ca3529c2c0c1fef7585c0 100644 --- a/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_adagrad_op_v2.py @@ -37,5 +37,28 @@ class TestAdagradOpV2(unittest.TestCase): adagrad.clear_grad() +class TestAdagradOpV2Group(TestAdagradOpV2): + def test_v20_coverage(self): + paddle.disable_static() + inp = paddle.rand(shape=[10, 10]) + linear_1 = paddle.nn.Linear(10, 10) + linear_2 = paddle.nn.Linear(10, 10) + out = linear_1(inp) + out = linear_2(out) + loss = paddle.mean(out) + adagrad = paddle.optimizer.Adagrad( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + }], + weight_decay=0.1) + out.backward() + adagrad.step() + adagrad.clear_grad() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index 1e316c3383ea76f968868fbc7f90ccc898bc61a8..aea2a074aedd58a1152efbaa8d276f7d1c82387c 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -810,5 +810,31 @@ class TestNetWithEpsilonTensor(unittest.TestCase): paddle.enable_static() +class TestAdamOpV2Group(TestAdamOpV2): + def test_adam_op(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 
= paddle.nn.Linear(5, 3) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adam( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + weight_decay=0.1) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamax_api.py b/python/paddle/fluid/tests/unittests/test_adamax_api.py index 6d2ec0eefbb1c5157fdbcb5a2e04e97e918a95c9..57cb9d3cb5f7ddef60f6577ba0d8217ab3d16b45 100644 --- a/python/paddle/fluid/tests/unittests/test_adamax_api.py +++ b/python/paddle/fluid/tests/unittests/test_adamax_api.py @@ -37,6 +37,7 @@ class TestAdamaxAPI(unittest.TestCase): adam.clear_gradients() def test_adamax_api(self): + paddle.enable_static() place = fluid.CPUPlace() shape = [2, 3, 8, 8] exe = fluid.Executor(place) @@ -63,5 +64,31 @@ class TestAdamaxAPI(unittest.TestCase): assert rets[0] is not None +class TestAdamaxAPIGroup(TestAdamaxAPI): + def test_adamax_api_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.Adamax( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99 + }], + weight_decay=0.1) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 9b77dae1afed2d58601724fed033119cffe6a8e6..ce01ca042c123d17ae629c11a86cb38f123251b3 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -121,5 +121,31 @@ class TestAdamWOp(unittest.TestCase): adam.clear_gradients() +class TestAdamWOpGroup(TestAdamWOp): + def test_adamw_op_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + adam = paddle.optimizer.AdamW( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001 + }], + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + + for _ in range(2): + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index fe82b23b73bdb23da2dc30a083ac91f94a5ed1fd..694fd3c656107f1ebaeb79042036e3566229c53b 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -90,12 +90,9 @@ class TestAssignOpError(unittest.TestCase): x1 = fluid.create_lod_tensor( np.array([[-1]]), [[1]], fluid.CPUPlace()) self.assertRaises(TypeError, fluid.layers.assign, x1) - # When the type of input is Variable, the dtype of input must be float16, float32, float64, int32, int64, bool. 
- x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") - self.assertRaises(TypeError, fluid.layers.assign, x3) # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. - x4 = np.array([[2.5, 2.5]], dtype='uint8') - self.assertRaises(TypeError, fluid.layers.assign, x4) + x2 = np.array([[2.5, 2.5]], dtype='uint8') + self.assertRaises(TypeError, fluid.layers.assign, x2) class TestAssignOApi(unittest.TestCase): @@ -180,12 +177,9 @@ class TestAssignOpErrorApi(unittest.TestCase): x1 = fluid.create_lod_tensor( np.array([[-1]]), [[1]], fluid.CPUPlace()) self.assertRaises(TypeError, paddle.assign, x1) - # When the type of input is Variable, the dtype of input must be float16, float32, float64, int32, int64, bool. - x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8") - self.assertRaises(TypeError, paddle.assign, x3) # When the type of input is numpy.ndarray, the dtype of input must be float32, int32. - x4 = np.array([[2.5, 2.5]], dtype='uint8') - self.assertRaises(TypeError, paddle.assign, x4) + x2 = np.array([[2.5, 2.5]], dtype='uint8') + self.assertRaises(TypeError, paddle.assign, x2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py index 287e85cb271f817b3581431a804685608b8cd91a..083b671c283a0f5fe0302837d60a414f3061632a 100755 --- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py @@ -19,6 +19,8 @@ import numpy as np from op_test import OpTest import paddle.fluid.core as core import paddle.fluid as fluid +import paddle +paddle.enable_static() def bilinear_interp_np(input, diff --git a/python/paddle/fluid/tests/unittests/test_c_embedding_op.py b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py new file mode 100644 index 0000000000000000000000000000000000000000..c0cae78ed2953837e98a8f69f2d5eaf20b4769aa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
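Note on the new c_embedding test that follows: the op looks up one shard of a row-partitioned embedding table, so ids that fall outside the shard's [start, end) range must come back as zero rows. A minimal standalone numpy sketch of that contract, mirroring the test's get_c_embedding helper (the shard range and table values below are illustrative, not taken from the test):

import numpy as np

def sharded_lookup(start, end, table, ids):
    # ids outside [start, end) belong to another shard: remap them to
    # row 0 for the gather, then zero out their output rows afterwards.
    index = ids.flatten()
    off_shard = (index < start) | (index >= end)
    local = index - start
    local[off_shard] = 0
    out = table[local]
    out[off_shard] = 0.0
    return out.reshape(ids.shape + (table.shape[1],))

table = np.arange(12, dtype=np.float64).reshape(4, 3)  # rows 10..13 of the full table
ids = np.array([[10, 13], [9, 14]])  # 9 and 14 live on other shards
print(sharded_lookup(10, 14, table, ids)[1])  # prints two all-zero rows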
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.framework import core + + +def get_c_embedding(start, end, table, ids): + index = ids.flatten() + input_mask = (index < start) | (index >= end) + masked_input = index - start + masked_input[input_mask] = 0 + output = table[masked_input] + output[input_mask] = 0.0 + return output + + +class TestCEmbeddingOp(OpTest): + def setUp(self): + self.op_type = "c_embedding" + table = np.random.random((17, 31)).astype("float64") + ids = np.random.randint( + low=0, high=17 * 2, size=(2, 4, 5)).astype("int32") + self.start_index = 10 + self.end_index = self.start_index + 17 + + self.inputs = {'W': table, 'Ids': ids} + np_out = get_c_embedding(self.start_index, self.end_index, table, ids) + self.outputs = {'Out': np_out.reshape((2, 4, 5, 31))} + self.attrs = {'start_index': self.start_index} + + def test_check_output_gpu(self): + if core.is_compiled_with_cuda(): + self.check_output_with_place(core.CUDAPlace(0)) + + def test_check_grad_gpu(self): + if core.is_compiled_with_cuda(): + self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index f625e1de4a3e0564037d71e2393f5914415917d9..ea59e070cbd51da440d81a3eb2236edb38385f2b 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -167,12 +167,15 @@ half_run_server.run_ut() _python = sys.executable ps_cmd = "{} {}".format(_python, server_file) + ps_proc = subprocess.Popen( ps_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - time.sleep(5) + outs, errs = ps_proc.communicate(timeout=15) + + time.sleep(1) os.environ["TRAINING_ROLE"] = "TRAINER" os.environ["http_proxy"] = "" @@ -180,6 +183,7 @@ half_run_server.run_ut() self.run_ut() ps_proc.kill() + ps_proc.wait() if os.path.exists(server_file): os.remove(server_file) diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_api.py b/python/paddle/fluid/tests/unittests/test_conv2d_api.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7fd8fe1bc28b73023f6fe6a60598323884a464 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_api.py @@ -0,0 +1,360 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
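Note on the API tests that follow: fluid.layers.conv2d accepts an integer pad, a per-edge list, a per-dimension pair list, or the algorithm strings "SAME" and "VALID", and the cases below walk through each form. For reference, a sketch of the output-size arithmetic those modes imply; the helper names are ours, but the formulas are the standard convolution arithmetic:

import math

def out_explicit(in_size, k, stride, pad, dilation=1):
    # Explicit padding: the effective kernel grows with dilation.
    eff_k = dilation * (k - 1) + 1
    return (in_size + 2 * pad - eff_k) // stride + 1

def out_same(in_size, stride):
    # "SAME" pads just enough to keep ceil(in / stride).
    return math.ceil(in_size / stride)

def out_valid(in_size, k, stride, dilation=1):
    # "VALID" is simply explicit padding of zero.
    return out_explicit(in_size, k, stride, pad=0, dilation=dilation)

print(out_explicit(5, 3, 1, 1))  # 5, matches the pad=1 cases on 5x5 inputs below
print(out_same(5, 1))            # 5
print(out_valid(5, 3, 1))        # 3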
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+
+import paddle
+paddle.enable_static()
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+from paddle.fluid import Program, program_guard
+
+
+class TestConv2DAPI(unittest.TestCase):
+    def test_api(self):
+
+        input_NHWC = fluid.layers.data(
+            name="input_NHWC",
+            shape=[2, 5, 5, 3],
+            append_batch_size=False,
+            dtype="float32")
+
+        input_NCHW = fluid.layers.data(
+            name="input_NCHW",
+            shape=[2, 3, 5, 5],
+            append_batch_size=False,
+            dtype="float32")
+
+        fluid.layers.conv2d(
+            input=input_NHWC,
+            num_filters=3,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding=0,
+            dilation=[1, 1],
+            groups=1,
+            data_format="NCHW")
+
+        fluid.layers.conv2d(
+            input=input_NCHW,
+            num_filters=3,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding=[1, 2, 1, 0],
+            dilation=[1, 1],
+            groups=1,
+            data_format="NCHW")
+
+        fluid.layers.conv2d(
+            input=input_NCHW,
+            num_filters=3,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding=[[0, 0], [0, 0], [1, 1], [1, 1]],
+            dilation=[1, 1],
+            groups=1,
+            data_format="NCHW")
+
+        fluid.layers.conv2d(
+            input=input_NHWC,
+            num_filters=3,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding=[[0, 0], [1, 1], [1, 1], [0, 0]],
+            dilation=[1, 1],
+            groups=1,
+            data_format="NHWC")
+
+        fluid.layers.conv2d(
+            input=input_NCHW,
+            num_filters=3,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding="SAME",
+            dilation=[1, 1],
+            groups=1,
+            data_format="NCHW")
+
+        fluid.layers.conv2d(
+            input=input_NCHW,
+            num_filters=3,
+            filter_size=[3, 3],
+            stride=[1, 1],
+            padding="VALID",
+            dilation=[1, 1],
+            groups=1,
+            data_format="NCHW")
+
+    def test_depthwise_conv2d(self):
+        x_var = paddle.uniform((2, 8, 8, 4), dtype='float32', min=-1., max=1.)
+        conv = paddle.nn.Conv2D(
+            in_channels=4,
+            out_channels=4,
+            kernel_size=(3, 3),
+            groups=4,
+            data_format='NHWC')
+        y_var = conv(x_var)
+
+
+class TestConv2DAPI_Error(unittest.TestCase):
+    def test_api(self):
+        input = fluid.layers.data(
+            name="input",
+            shape=[2, 5, 5, 5],
+            append_batch_size=False,
+            dtype="float32")
+
+        # ValueError: cudnn
+        def run_1():
+            fluid.layers.conv2d(
+                input=input,
+                num_filters=3,
+                filter_size=[3, 3],
+                stride=[1, 1],
+                padding=0,
+                dilation=[1, 1],
+                groups=1,
+                use_cudnn=[0],
+                data_format="NCHW")
+
+        self.assertRaises(ValueError, run_1)
+
+        # ValueError: data_format
+        def run_2():
+            fluid.layers.conv2d(
+                input=input,
+                num_filters=3,
+                filter_size=[3, 3],
+                stride=[1, 1],
+                padding=0,
+                dilation=[1, 1],
+                groups=1,
+                use_cudnn=False,
+                data_format="NCHWC")
+
+        self.assertRaises(ValueError, run_2)
+
+        # ValueError: padding
+        def run_3():
+            fluid.layers.conv2d(
+                input=input,
+                num_filters=3,
+                filter_size=[3, 3],
+                stride=[1, 1],
+                padding="SAMEE",
+                dilation=[1, 1],
+                groups=1,
+                use_cudnn=False,
+                data_format="NCHW")
+
+        self.assertRaises(ValueError, run_3)
+
+        def run_4():
+            fluid.layers.conv2d(
+                input=input,
+                num_filters=3,
+                filter_size=[3, 3],
+                stride=[1, 1],
+                padding=[[0, 1], [0, 1], [0, 1], [0, 1]],
+                dilation=[1, 1],
+                groups=1,
+                use_cudnn=False,
+                data_format="NCHW")
+
+        self.assertRaises(ValueError, run_4)
+
+        def run_5():
+            fluid.layers.conv2d(
+                input=input,
+                num_filters=3,
+                filter_size=[3, 3],
+                stride=[1, 1],
+                padding=[[0, 1], [0, 1], [0, 1], [0, 1]],
+                dilation=[1, 1],
+                groups=1,
+                use_cudnn=False,
+                data_format="NHWC")
+
+        self.assertRaises(ValueError, run_5)
+
+        # ValueError: channel dimension
+        x = fluid.layers.data(
+            name="x",
+            shape=[2, 5, 5, -1],
+            append_batch_size=False,
+
dtype="float32") + + def run_6(): + fluid.layers.conv2d( + input=x, + num_filters=3, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + use_cudnn=False, + data_format="NHWC") + + self.assertRaises(ValueError, run_6) + + # ValueError: groups + def run_7(): + fluid.layers.conv2d( + input=input, + num_filters=3, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=3, + use_cudnn=False, + data_format="NHWC") + + self.assertRaises(ValueError, run_7) + + # ValueError: filter num + def run_8(): + fluid.layers.conv2d( + input=input, + num_filters=0, + filter_size=0, + stride=0, + padding=0, + dilation=0, + groups=1, + use_cudnn=False, + data_format="NCHW") + + self.assertRaises(ValueError, run_8) + + # ValueError: groups + def run_9(): + fluid.layers.conv2d( + input=input, + num_filters=0, + filter_size=0, + stride=0, + padding=0, + dilation=0, + groups=0, + use_cudnn=False, + data_format="NCHW") + + self.assertRaises(ValueError, run_9) + + # ValueError: stride + def run_10(): + fluid.layers.conv2d( + input=input, + num_filters=1, + filter_size=1, + stride=0, + padding=0, + dilation=0, + groups=1, + use_cudnn=False, + data_format="NCHW") + + self.assertRaises(ValueError, run_10) + + def test_api_with_error_input(self): + input = fluid.layers.data( + name="error_input", + shape=[1], + append_batch_size=False, + dtype="float32") + + # ValueError: cudnn + def run_1(): + fluid.layers.conv2d( + input=input, + num_filters=0, + filter_size=0, + stride=0, + padding=0, + dilation=0, + groups=0, + use_cudnn=False, + data_format="NCHW") + + self.assertRaises(ValueError, run_1) + + +# --------- test environment variable ------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), + "core is not compiled with CUDA or ROCM") +class TestConv2DEnviron(unittest.TestCase): + def run1(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + inputs = fluid.layers.data( + shape=[2, 3, 5, 5], + append_batch_size=False, + name="inputs", + dtype="float32") + result = fluid.layers.conv2d( + input=inputs, + num_filters=4, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + data_format="NCHW") + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + fetches = exe.run(fluid.default_main_program(), + feed={"inputs": self.input_np}, + fetch_list=[result]) + + def run2(self, place): + with fluid.dygraph.guard(place): + inputs = fluid.dygraph.to_variable(self.input_np) + conv = paddle.nn.Conv2D( + in_channels=3, + out_channels=4, + kernel_size=(3, 3), + data_format="NCHW") + result = conv(inputs) + + def run3(self, place): + with fluid.dygraph.guard(place): + inputs = fluid.dygraph.to_variable(self.input_np) + conv = paddle.fluid.dygraph.nn.Conv2D( + num_channels=3, + num_filters=4, + filter_size=(3, 3), ) + result = conv(inputs) + + def run_all(self, place): + self.run1(place) + self.run2(place) + self.run3(place) + + def test_environ(self): + self.input_np = np.random.random([2, 3, 5, 5]).astype("float32") + for place in [paddle.CPUPlace(), paddle.CUDAPlace(0)]: + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) + self.run_all(place) + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) + self.run_all(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py index 
f92a05158ce1ad1569321aae5218ab6031f26f53..f933d5bf7a48f14b0f4cb4f7ce274744f28c4c24 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py @@ -268,6 +268,12 @@ def add_error_cases(suite): suite.addTest( Conv2DErrorTestCase( methodName='runTest', num_channels=5, groups=2)) + suite.addTest( + Conv2DErrorTestCase( + methodName='runTest', num_channels=5, groups=2, stride=0)) + suite.addTest( + Conv2DErrorTestCase( + methodName='runTest', num_channels=5, groups=2, padding=[-1, -1])) def load_tests(loader, standard_tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 77eac2fbd7fe04fcfd145558fe67c521136c776e..db05801c7227b03d7f7a06639abe9d3a779d5faf 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -20,7 +20,8 @@ import numpy as np import paddle import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.testsuite import create_op from paddle.fluid import Program, program_guard @@ -167,6 +168,52 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): globals()[cls_name] = TestConv2DCUDNNFp16 +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0" + ) + class TestConv2DCUDNNBF16(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op(scope, self.op_type, self.inputs, self.outputs, + self.attrs) + return get_numeric_gradient(place, scope, op, self.inputs_fp32, + check_name, ['Output']) + + def init_kernel_type(self): + self.use_cudnn = True + self.no_need_check_grad = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-2) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Input') + self.check_grad_with_place( + place, ['Input'], + 'Output', + no_grad_set=set(['Filter']), + user_defined_grads=[numeric_grads]) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Filter') + self.check_grad_with_place( + place, ['Filter'], + 'Output', + no_grad_set=set(['Input']), + user_defined_grads=[numeric_grads]) + + cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16") + TestConv2DCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv2DCUDNNBF16 + + def create_test_channel_last_class(parent): class TestChannelLastCase(parent): def init_data_format(self): @@ -319,7 +366,15 @@ class TestConv2DOp(OpTest): 'dilation': self.dilations } - input = np.random.random(self.input_size).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.uniform(-1, 1, + self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.uniform(-1, 1, + self.filter_size).astype(self.dtype) + if not self.has_cuda(): self.fuse_relu_before_depthwise_conv = False if self.fuse_relu_before_depthwise_conv: @@ -329,16 +384,27 @@ class TestConv2DOp(OpTest): input2 = np.maximum(input, 
0.0) else: input2 = input - filter = np.random.uniform(-1, 1, self.filter_size).astype(self.dtype) output, _, _, _, _ = conv2d_forward_naive(input2, filter, self.groups, conv2d_param) - output = output.astype(self.dtype) - self.inputs = { - 'Input': OpTest.np_dtype_to_fluid_dtype(input), - 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) - } + if self.is_bfloat16_op(): + output = output.astype(np.float32) + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter) + } + self.inputs_fp32 = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + else: + output = output.astype(self.dtype) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -554,146 +620,14 @@ create_test_cudnn_fp16_class(TestWithGroup, grad_check=False) create_test_cudnn_fp16_class(TestWith1x1, grad_check=False) create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False) -#----------------TestDepthwiseConv ----- - - -class TestDepthwiseConv(TestConv2DOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConv2(TestConv2DOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConv3(TestConv2DOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - +#----------------Conv2DCUDNN bf16---------------- -class TestDepthwiseConvWithDilation(TestConv2DOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConvWithDilation2(TestConv2DOp): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConvandFuse(TestConv2DOp): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class 
TestDepthwiseConv2andFuse(TestConv2DOp): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConv3andFuse(TestConv2DOp): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConvWithDilationandFuse(TestConv2DOp): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - -class TestDepthwiseConvWithDilation2andFuse(TestConv2DOp): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" +create_test_cudnn_bf16_class(TestConv2DOp) +create_test_cudnn_bf16_class(TestWithPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroup) +create_test_cudnn_bf16_class(TestWith1x1) +create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1) class TestCUDNNExhaustiveSearch(TestConv2DOp): @@ -1016,183 +950,6 @@ create_test_cudnn_class(TestWithGroup_AsyPadding) create_test_cudnn_class(TestWith1x1_AsyPadding) create_test_cudnn_class(TestWithInput1x1Filter1x1_AsyPadding) - -class TestDepthwiseConv_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.use_cuda = True - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [1, 1, 0, 1] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConv2_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.use_cuda = True - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [0, 1, 0, 2] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConv3_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.use_cuda = True - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = 
[1, 1, 0, 0] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConvWithDilation_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [1, 1, 2, 1] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConvWithDilation2_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [0, 1, 1, 0] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConvandFuse_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [2, 1, 2, 3] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConv2andFuse_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [12, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [1, 1, 1, 2] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConv3andFuse_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [1, 2, 0, 2] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConvWithDilationandFuse_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [2, 2] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [2, 1, 1, 0] - self.padding_algorithm = "EXPLICIT" - - -class TestDepthwiseConvWithDilation2andFuse_AsyPadding(TestConv2DOp_v2): - def init_test_case(self): - self.fuse_relu_before_depthwise_conv = True - self.use_cuda = True - self.pad = [1, 1] - self.stride = [1, 1] - self.input_size = [2, 3, 5, 5] # NCHW - self.groups = 3 - self.dilations = [2, 2] - assert np.mod(self.input_size[1], 
self.groups) == 0 - f_c = self.input_size[1] // self.groups - self.filter_size = [24, f_c, 3, 3] - self.op_type = "depthwise_conv2d" - - def init_paddings(self): - self.pad = [1, 3, 1, 3] - self.padding_algorithm = "EXPLICIT" - - #---------- test SAME VALID ----------- create_test_padding_SAME_class(TestConv2DOp_AsyPadding) create_test_padding_SAME_class(TestWithPad_AsyPadding) @@ -1218,18 +975,6 @@ create_test_cudnn_padding_VALID_class(TestWithStride_AsyPadding) create_test_cudnn_padding_VALID_class(TestWithGroup_AsyPadding) create_test_cudnn_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding) -# depthwise conv2d - -create_test_padding_SAME_class(TestDepthwiseConv_AsyPadding) -create_test_padding_SAME_class(TestDepthwiseConvWithDilation_AsyPadding) -create_test_padding_SAME_class(TestDepthwiseConvandFuse_AsyPadding) -create_test_padding_SAME_class(TestDepthwiseConvWithDilationandFuse_AsyPadding) - -create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding) -create_test_padding_VALID_class(TestDepthwiseConvWithDilation_AsyPadding) -create_test_padding_VALID_class(TestDepthwiseConvandFuse_AsyPadding) -create_test_padding_VALID_class(TestDepthwiseConvWithDilationandFuse_AsyPadding) - # ------------ test channel last --------- create_test_channel_last_class(TestConv2DOp_AsyPadding) create_test_channel_last_class(TestWithPad_AsyPadding) @@ -1237,28 +982,12 @@ create_test_channel_last_class(TestWithGroup_AsyPadding) create_test_channel_last_class(TestWith1x1_AsyPadding) create_test_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding) -create_test_channel_last_class(TestDepthwiseConv_AsyPadding) -create_test_channel_last_class(TestDepthwiseConvWithDilation2_AsyPadding) -create_test_channel_last_class(TestDepthwiseConvandFuse_AsyPadding) -create_test_channel_last_class(TestDepthwiseConvWithDilationandFuse_AsyPadding) - create_test_cudnn_channel_last_class(TestConv2DOp_AsyPadding) create_test_cudnn_channel_last_class(TestWithPad_AsyPadding) create_test_cudnn_channel_last_class(TestWithStride_AsyPadding) create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding) create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding) -# ------------ depthwise conv2d in MIOPEN --------- -if core.is_compiled_with_rocm(): - create_test_cudnn_padding_SAME_class(TestDepthwiseConv_AsyPadding) - create_test_cudnn_padding_SAME_class( - TestDepthwiseConvWithDilation_AsyPadding) - create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding) - create_test_padding_VALID_class(TestDepthwiseConvWithDilation_AsyPadding) - create_test_cudnn_channel_last_class(TestDepthwiseConv_AsyPadding) - create_test_cudnn_channel_last_class( - TestDepthwiseConvWithDilation2_AsyPadding) - create_test_cudnn_channel_last_fp16_class( TestConv2DOp_AsyPadding, grad_check=False) create_test_cudnn_channel_last_fp16_class( @@ -1270,271 +999,5 @@ create_test_cudnn_channel_last_fp16_class( create_test_cudnn_channel_last_fp16_class( TestWithDilation_AsyPadding, grad_check=False) - -# --------- test python API --------------- -class TestConv2DAPI(unittest.TestCase): - def test_api(self): - - input_NHWC = fluid.layers.data( - name="input_NHWC", - shape=[2, 5, 5, 3], - append_batch_size=False, - dtype="float32") - - input_NCHW = fluid.layers.data( - name="input_NCHW", - shape=[2, 3, 5, 5], - append_batch_size=False, - dtype="float32") - - fluid.layers.conv2d( - input=input_NHWC, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - data_format="NCHW") - - fluid.layers.conv2d( - 
input=input_NCHW, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=[1, 2, 1, 0], - dilation=[1, 1], - groups=1, - data_format="NCHW") - - fluid.layers.conv2d( - input=input_NCHW, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=[[0, 0], [0, 0], [1, 1], [1, 1]], - dilation=[1, 1], - groups=1, - data_format="NCHW") - - fluid.layers.conv2d( - input=input_NHWC, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=[[0, 0], [1, 1], [1, 1], [0, 0]], - dilation=[1, 1], - groups=1, - data_format="NHWC") - - fluid.layers.conv2d( - input=input_NCHW, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding="SAME", - dilation=[1, 1], - groups=1, - data_format="NCHW") - - fluid.layers.conv2d( - input=input_NCHW, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding="VALID", - dilation=[1, 1], - groups=1, - data_format="NCHW") - - def test_depthwise_conv2d(self): - x_var = paddle.uniform((2, 8, 8, 4), dtype='float32', min=-1., max=1.) - conv = paddle.nn.Conv2D( - in_channels=4, - out_channels=4, - kernel_size=(3, 3), - groups=4, - data_format='NHWC') - y_var = conv(x_var) - - -class TestConv2DAPI_Error(unittest.TestCase): - def test_api(self): - input = fluid.layers.data( - name="input", - shape=[2, 5, 5, 5], - append_batch_size=False, - dtype="float32") - - # ValueError: cudnn - def run_1(): - fluid.layers.conv2d( - input=input, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - use_cudnn=[0], - data_format="NCHW") - - self.assertRaises(ValueError, run_1) - - # ValueError: data_format - def run_2(): - fluid.layers.conv2d( - input=input, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - use_cudnn=False, - data_format="NCHWC") - - self.assertRaises(ValueError, run_2) - - # ValueError: padding - def run_3(): - fluid.layers.conv2d( - input=input, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding="SAMEE", - dilation=[1, 1], - groups=1, - use_cudnn=False, - data_format="NCHW") - - self.assertRaises(ValueError, run_3) - - def run_4(): - fluid.layers.conv2d( - input=input, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=[[0, 1], [0, 1], [0, 1], [0, 1]], - dilation=[1, 1], - groups=1, - use_cudnn=False, - data_format="NCHW") - - self.assertRaises(ValueError, run_4) - - def run_5(): - fluid.layers.conv2d( - input=input, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=[[0, 1], [0, 1], [0, 1], [0, 1]], - dilation=[1, 1], - groups=1, - use_cudnn=False, - data_format="NHWC") - - self.assertRaises(ValueError, run_5) - - # ValueError: channel dimmention - x = fluid.layers.data( - name="x", - shape=[2, 5, 5, -1], - append_batch_size=False, - dtype="float32") - - def run_6(): - fluid.layers.conv2d( - input=x, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - use_cudnn=False, - data_format="NHWC") - - self.assertRaises(ValueError, run_6) - - # ValueError: groups - def run_7(): - fluid.layers.conv2d( - input=input, - num_filters=3, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=3, - use_cudnn=False, - data_format="NHWC") - - self.assertRaises(ValueError, run_7) - - -# --------- test environment variable ------ -@unittest.skipIf( - not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), - "core is not compiled with CUDA or ROCM") -class TestConv2DEnviron(unittest.TestCase): - def run1(self, place): - with 
fluid.program_guard(fluid.Program(), fluid.Program()): - inputs = fluid.layers.data( - shape=[2, 3, 5, 5], - append_batch_size=False, - name="inputs", - dtype="float32") - result = fluid.layers.conv2d( - input=inputs, - num_filters=4, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - data_format="NCHW") - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - fetches = exe.run(fluid.default_main_program(), - feed={"inputs": self.input_np}, - fetch_list=[result]) - - def run2(self, place): - with fluid.dygraph.guard(place): - inputs = fluid.dygraph.to_variable(self.input_np) - conv = paddle.nn.Conv2D( - in_channels=3, - out_channels=4, - kernel_size=(3, 3), - data_format="NCHW") - result = conv(inputs) - - def run3(self, place): - with fluid.dygraph.guard(place): - inputs = fluid.dygraph.to_variable(self.input_np) - conv = paddle.fluid.dygraph.nn.Conv2D( - num_channels=3, - num_filters=4, - filter_size=(3, 3), ) - result = conv(inputs) - - def run_all(self, place): - self.run1(place) - self.run2(place) - self.run3(place) - - def test_environ(self): - self.input_np = np.random.random([2, 3, 5, 5]).astype("float32") - for place in [paddle.CPUPlace(), paddle.CUDAPlace(0)]: - fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) - self.run_all(place) - fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) - self.run_all(place) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py b/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..1b680c5a06be6956194eee53fe634b6a03c502b7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op_depthwise_conv.py @@ -0,0 +1,377 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
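Note on the relocated depthwise tests that follow: every case repeats the same shape bookkeeping, so it is worth spelling out once. With an NCHW input of C channels and `groups` groups, each filter convolves C // groups input channels, and the number of filters must stay divisible by groups. A standalone sketch using the same numbers as the tests ([2, 3, 5, 5], groups=3):

input_size = [2, 3, 5, 5]  # NCHW
groups = 3
assert input_size[1] % groups == 0
f_c = input_size[1] // groups       # channels seen by each filter: 1
filter_size = [12, f_c, 3, 3]       # 12 filters -> 4 output channels per group
assert filter_size[0] % groups == 0
print(f_c, filter_size)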
+ +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +paddle.enable_static() +import paddle.fluid.core as core +import paddle.fluid as fluid +from op_test import OpTest +from paddle.fluid import Program, program_guard +from test_conv2d_op import TestConv2DOp, TestConv2DOp_v2, create_test_padding_SAME_class, create_test_padding_VALID_class, create_test_channel_last_class, create_test_cudnn_padding_SAME_class, create_test_cudnn_channel_last_class + +#----------------TestDepthwiseConv ----- + + +class TestDepthwiseConv(TestConv2DOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2(TestConv2DOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3(TestConv2DOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvWithDilation(TestConv2DOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvWithDilation2(TestConv2DOp): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvandFuse(TestConv2DOp): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv2andFuse(TestConv2DOp): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv3andFuse(TestConv2DOp): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert 
np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvWithDilationandFuse(TestConv2DOp): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConvWithDilation2andFuse(TestConv2DOp): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + +class TestDepthwiseConv_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.use_cuda = True + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 0, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv2_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.use_cuda = True + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [0, 1, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv3_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.use_cuda = True + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 0, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvWithDilation_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 2, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvWithDilation2_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [0, 1, 1, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvandFuse_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + 
self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [2, 1, 2, 3] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv2andFuse_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [12, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 1, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConv3andFuse_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 2, 0, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvWithDilationandFuse_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [2, 1, 1, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestDepthwiseConvWithDilation2andFuse_AsyPadding(TestConv2DOp_v2): + def init_test_case(self): + self.fuse_relu_before_depthwise_conv = True + self.use_cuda = True + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + self.groups = 3 + self.dilations = [2, 2] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [24, f_c, 3, 3] + self.op_type = "depthwise_conv2d" + + def init_paddings(self): + self.pad = [1, 3, 1, 3] + self.padding_algorithm = "EXPLICIT" + + +# depthwise conv2d + +create_test_padding_SAME_class(TestDepthwiseConv_AsyPadding) +create_test_padding_SAME_class(TestDepthwiseConvWithDilation_AsyPadding) +create_test_padding_SAME_class(TestDepthwiseConvandFuse_AsyPadding) +create_test_padding_SAME_class(TestDepthwiseConvWithDilationandFuse_AsyPadding) + +create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding) +create_test_padding_VALID_class(TestDepthwiseConvWithDilation_AsyPadding) +create_test_padding_VALID_class(TestDepthwiseConvandFuse_AsyPadding) +create_test_padding_VALID_class(TestDepthwiseConvWithDilationandFuse_AsyPadding) + +# channel last + +create_test_channel_last_class(TestDepthwiseConv_AsyPadding) +create_test_channel_last_class(TestDepthwiseConvWithDilation2_AsyPadding) +create_test_channel_last_class(TestDepthwiseConvandFuse_AsyPadding) +create_test_channel_last_class(TestDepthwiseConvWithDilationandFuse_AsyPadding) + +# ------------ depthwise conv2d 
in MIOPEN --------- +if core.is_compiled_with_rocm(): + create_test_cudnn_padding_SAME_class(TestDepthwiseConv_AsyPadding) + create_test_cudnn_padding_SAME_class( + TestDepthwiseConvWithDilation_AsyPadding) + create_test_padding_VALID_class(TestDepthwiseConv_AsyPadding) + create_test_padding_VALID_class(TestDepthwiseConvWithDilation_AsyPadding) + create_test_cudnn_channel_last_class(TestDepthwiseConv_AsyPadding) + create_test_cudnn_channel_last_class( + TestDepthwiseConvWithDilation2_AsyPadding) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py index b45e2d1a6aa1456f6ce9efa45f0c9f88ba55fa07..707991352fa5e160ca651e55b5e1ec0caa04f2f4 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py @@ -221,6 +221,9 @@ def add_error_cases(suite): suite.addTest( Conv3DErrorTestCase( methodName='runTest', num_channels=5, groups=2)) + suite.addTest( + Conv3DErrorTestCase( + methodName='runTest', num_channels=5, groups=2, padding=[-1, 1, 3])) def load_tests(loader, standard_tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 59d1f3216e17e114b8b51e9cfef62a6ff45663c4..5f23d04dde51cc21e66098ee6e37027bf82d7537 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -984,6 +984,21 @@ class TestConv3DAPI_Error(unittest.TestCase): self.assertRaises(ValueError, run_7) + # ValueError: filter num + def run_8(): + fluid.layers.conv3d( + input=input, + num_filters=0, + filter_size=0, + stride=0, + padding=0, + dilation=0, + groups=1, + use_cudnn=False, + data_format="NDHWC") + + self.assertRaises(ValueError, run_8) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index a567ec727389366e020441e336c12c4395d8e056..19249fcfeb3a6044b6ad4bb5f17fd4dbdd693a65 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -238,6 +238,9 @@ def add_error_cases(suite): suite.addTest( Conv3DTransposeErrorTestCase( methodName='runTest', output_size="not_valid")) + suite.addTest( + Conv3DTransposeErrorTestCase( + methodName='runTest', num_channels=5, groups=2, padding=[-1, 1, 3])) def load_tests(loader, standard_tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py index 80c10886826e7d5fe2a04efb53cfb12e383b11b5..13624d189f72b61f1e042d0353e594add08a5ce7 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py @@ -285,6 +285,19 @@ class TestModulatedDeformableConvInvalidInput(unittest.TestCase): self.assertRaises(TypeError, test_invalid_offset) + def test_invalid_filter(): + paddle.enable_static() + input = fluid.data( + name='input_filter', shape=[None, 3, 32, 32], dtype='float32') + offset = fluid.data( + name='offset_filter', shape=[None, 3, 32, 32], dtype='float32') + mask = fluid.data( + name='mask_filter', shape=[None, 3, 32, 32], dtype='float32') + loss = fluid.layers.deformable_conv( + input, offset, mask, num_filters=4, filter_size=0) + + 
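+ # filter_size=0 does not pass deformable_conv's attribute validation, so the call above is expected to raise ValueError: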
self.assertRaises(ValueError, test_invalid_filter) + class TestDeformConv2DAPI(unittest.TestCase): def test_api(self): diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index edc510e4e766d7f1e8898c831204806b0b8f954d..78b06bd5333d79b4aa90d00f1c1f16a399e61929 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -186,6 +186,76 @@ class TestDistRunnerBase(object): fleet.save_inference_model(exe, infer_save_dir_fleet, feeded_var_names, [avg_cost]) + def run_use_fleet_api_20_trainer(self, args): + """ + 1. Remove the DistributedStrategy code here and leave building the DistributedStrategy to get_model(). + 2. To run with the fleet 2.0 API, set both the _use_fleet_api and _use_fleet_api_20 flags to True. + 3. Testing model saving is not supported yet. + """ + assert args.update_method in ("nccl2", "bkcl") + + self.lr = args.lr + print_to_err("use_fleet 2.0", "fleet.node_num:") + + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ + self.get_model(batch_size=args.batch_size) + + if fluid.core.is_compiled_with_cuda(): + device_id = int(os.getenv("FLAGS_selected_gpus", "0")) + place = fluid.CUDAPlace(device_id) + elif fluid.core.is_compiled_with_xpu(): + device_id = int(os.getenv("FLAGS_selected_xpus", "0")) + place = fluid.XPUPlace(device_id) + else: + raise ValueError( + "the fleet API test must run with paddlepaddle-gpu or paddlepaddle-xpu." + ) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + eprint(type(self).__name__, "run worker startup program done.") + + feed_var_list = [ + var + for var in fluid.default_main_program().global_block().vars.values() + if var.is_data + ] + + eprint("feed_var_list:", feed_var_list) + + if feed_var_list[0].name == 'label': + feed_var_list = feed_var_list[::-1] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = train_reader() + + def get_data(): + origin_batch = next(reader_generator) + if args.update_method != "local" and args.use_reader_alloc: + new_batch = [] + for offset, item in enumerate(origin_batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return origin_batch + + print_to_err(type(self).__name__, "begin to train on trainer") + out_losses = [] + for i in six.moves.xrange(RUN_STEP): + loss, = exe.run(fluid.default_main_program(), + fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + out_losses.append(loss[0]) + print_to_err(type(self).__name__, "run step %d finished" % i) + print_to_err(type(self).__name__, "trainer run finished") + print_to_err(type(self).__name__, "dist losses: {}".format(out_losses)) + + if six.PY2: + print(pickle.dumps(out_losses)) + else: + sys.stdout.buffer.write(pickle.dumps(out_losses)) + def run_use_fleet_api_trainer(self, args): assert args.update_method == "nccl2" or "bkcl" @@ -630,6 +700,7 @@ def runtime_main(test_class): parser.add_argument('--use_hallreduce', action='store_true') parser.add_argument('--use_pipeline', action='store_true') parser.add_argument('--use_fleet_api', action='store_true') + parser.add_argument('--use_fleet_api_20', action='store_true') parser.add_argument('--use_local_sgd', action='store_true') parser.add_argument('--ut4grad_allreduce', action='store_true') parser.add_argument( @@ -671,6 +742,8 @@ def runtime_main(test_class): model.run_pserver(args) elif args.use_fleet_api: model.run_use_fleet_api_trainer(args) + elif args.use_fleet_api_20: +
model.run_use_fleet_api_20_trainer(args) elif args.use_pipeline: model.run_pipeline_trainer(args) else: @@ -734,6 +807,7 @@ class TestDistBase(unittest.TestCase): self._nccl_comm_num = 1 self._enable_backward_deps = False self._use_fleet_api = False + self._use_fleet_api_20 = False self._use_local_sgd = False self._ut4grad_allreduce = False self._use_hallreduce = False @@ -1060,7 +1134,7 @@ class TestDistBase(unittest.TestCase): tr_cmd += " --fuse_all_reduce {}".format(self._fuse_all_reduce) if self._use_fleet_api: - tr_cmd += " --use_fleet_api" + tr_cmd += " --use_fleet_api_20" if self._use_fleet_api_20 else " --use_fleet_api" if self._use_local_sgd: tr_cmd += " --use_local_sgd" if self._ut4grad_allreduce: diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index e84e91de0ba79ac195540dce620034e30e70f0d1..80b7eb136479720610214d744c8031a5c5be177b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -241,42 +241,72 @@ class TestFleetBase(unittest.TestCase): def _start_pserver(self, cmd, required_envs): ps0_cmd, ps1_cmd = cmd.format(0), cmd.format(1) - ps0_pipe = open(tempfile.gettempdir() + "/ps0_err.log", "wb+") - ps1_pipe = open(tempfile.gettempdir() + "/ps1_err.log", "wb+") + log_dirname = required_envs.get("LOG_DIRNAME", tempfile.gettempdir()) + log_prename = required_envs.get("LOG_PREFIX", "") + + if log_dirname: + log_prename += "_" + + ps0_err_log = os.path.join(log_dirname, log_prename + "ps0_stderr.log") + ps1_err_log = os.path.join(log_dirname, log_prename + "ps1_stderr.log") + ps0_out_log = os.path.join(log_dirname, log_prename + "ps0_stdout.log") + ps1_out_log = os.path.join(log_dirname, log_prename + "ps1_stdout.log") + + ps0_err = open(ps0_err_log, "wb+") + ps1_err = open(ps1_err_log, "wb+") + + ps0_out = open(ps0_out_log, "wb+") + ps1_out = open(ps1_out_log, "wb+") ps0_proc = subprocess.Popen( ps0_cmd.strip().split(" "), - stdout=subprocess.PIPE, - stderr=ps0_pipe, + stdout=ps0_out, + stderr=ps0_err, env=required_envs) + ps1_proc = subprocess.Popen( ps1_cmd.strip().split(" "), - stdout=subprocess.PIPE, - stderr=ps1_pipe, + stdout=ps1_out, + stderr=ps1_err, env=required_envs) - return ps0_proc, ps1_proc, ps0_pipe, ps1_pipe + + return ((ps0_proc, ps0_out, ps0_err, ps0_out_log, ps0_err_log), + (ps1_proc, ps1_out, ps1_err, ps1_out_log, ps1_err_log)) def _start_trainer(self, cmd, required_envs): tr0_cmd, tr1_cmd = cmd.format(0), cmd.format(1) - tr0_pipe = open(tempfile.gettempdir() + "/tr0_err.log", "wb+") - tr1_pipe = open(tempfile.gettempdir() + "/tr1_err.log", "wb+") + log_dirname = required_envs.get("LOG_DIRNAME", tempfile.gettempdir()) + log_prename = required_envs.get("LOG_PREFIX", "") + + if log_dirname: + log_prename += "_" + + tr0_err_log = os.path.join(log_dirname, log_prename + "tr0_stderr.log") + tr1_err_log = os.path.join(log_dirname, log_prename + "tr1_stderr.log") + tr0_out_log = os.path.join(log_dirname, log_prename + "tr0_stdout.log") + tr1_out_log = os.path.join(log_dirname, log_prename + "tr1_stdout.log") - tr0_out = open(tempfile.gettempdir() + "/tr0_stdout.log", "wb+") - tr1_out = open(tempfile.gettempdir() + "/tr1_stdout.log", "wb+") + tr0_err = open(tr0_err_log, "wb+") + tr1_err = open(tr1_err_log, "wb+") + + tr0_out = open(tr0_out_log, "wb+") + tr1_out = open(tr1_out_log, "wb+") tr0_proc = subprocess.Popen( tr0_cmd.strip().split(" "), stdout=tr0_out, - stderr=tr0_pipe, + stderr=tr0_err, 
env=required_envs) + tr1_proc = subprocess.Popen( tr1_cmd.strip().split(" "), stdout=tr1_out, - stderr=tr1_pipe, + stderr=tr1_err, env=required_envs) - return tr0_proc, tr1_proc, tr0_pipe, tr1_pipe + return ((tr0_proc, tr0_out, tr0_err, tr0_out_log, tr0_err_log), + (tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log)) def _run_cluster(self, model, envs): env = {'GRAD_CLIP': str(self._grad_clip_mode)} @@ -303,57 +333,87 @@ class TestFleetBase(unittest.TestCase): ps_cmd += " --model_dir {}".format(self._model_dir) # Run dist train to compare with local results - ps0, ps1, ps0_pipe, ps1_pipe = self._start_pserver(ps_cmd, env) - tr0, tr1, tr0_pipe, tr1_pipe = self._start_trainer(tr_cmd, env) + ps0, ps1 = self._start_pserver(ps_cmd, env) + tr0, tr1 = self._start_trainer(tr_cmd, env) + + ps0_proc, ps0_out, ps0_err, ps0_out_log, ps0_err_log = ps0 + ps1_proc, ps1_out, ps1_err, ps1_out_log, ps1_err_log = ps1 + + tr0_proc, tr0_out, tr0_err, tr0_out_log, tr0_err_log = tr0 + tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log = tr1 # Wait until trainer process terminate - while True: - stat0 = tr0.poll() - time.sleep(0.1) - if stat0 is not None: - break + time_out = 120 + cur_time = 0 while True: - stat1 = tr1.poll() - time.sleep(0.1) - if stat1 is not None: + stat0 = tr0_proc.poll() + stat1 = tr1_proc.poll() + + if stat0 is not None and stat1 is not None: + break + else: + time.sleep(0.5) + cur_time += 0.5 + + if cur_time >= time_out: + tr0_proc.terminate() + tr1_proc.terminate() + tr0_proc.wait() + tr1_proc.wait() break - tr0_out, tr0_err = tr0.communicate() - tr1_out, tr1_err = tr1.communicate() - - tr0_ret = tr0.returncode - tr1_ret = tr0.returncode - if tr0_ret != 0: - print( - "========================Error tr0_err begin===========================" - ) - os.system("cat {}".format(tempfile.gettempdir() + "/tr0_err.log")) - print( - "========================Error tr0_err end===========================" - ) - - if tr1_ret != 0: - print( - "========================Error tr1_err begin===========================" - ) - os.system("cat {}".format(tempfile.gettempdir() + "/tr1_err.log")) - print( - "========================Error tr1_err end===========================" - ) - - # close trainer file - tr0_pipe.close() - tr1_pipe.close() - ps0_pipe.close() - ps1_pipe.close() - - ps0.terminate() - ps1.terminate() + tr0_ret = tr0_proc.returncode + tr1_ret = tr1_proc.returncode + + ps0_proc.kill() + ps1_proc.kill() + ps0_proc.wait() + ps1_proc.wait() + + def is_listen_failed(logx): + is_lf = False + + listen_rgx = "Fail to listen" + + with open(logx, "r") as rb: + for line in rb.readlines(): + if listen_rgx in line: + is_lf = True + break + return is_lf + + def catlog(logx): + basename = os.path.basename(logx) + print("\n================== Error {} begin =====================". + format(basename)) + os.system("cat {}".format(logx)) + print("================== Error {} end =====================\n". 
+ format(basename)) + + if tr0_ret != 0 or tr1_ret != 0: + if is_listen_failed(ps0_err_log) or is_listen_failed(ps1_err_log): + print("parameter server port bind failed, skipping the error") + tr0_ret, tr1_ret = 0, 0 + else: + for out, err in [ + (ps0_out_log, ps0_err_log), (ps1_out_log, ps1_err_log), + (tr0_out_log, tr0_err_log), (tr1_out_log, tr1_err_log) + ]: + catlog(out) + catlog(err) + + for pipe in [ + tr0_err, tr0_out, tr1_err, tr1_out, ps0_err, ps0_out, ps1_err, + ps1_out + ]: + pipe.close() shutil.rmtree(gloo_path) + self.assertEqual(tr0_ret, 0, "something wrong in tr0, please check") self.assertEqual(tr1_ret, 0, "something wrong in tr1, please check") + return 0, 0 def check_with_place(self, @@ -399,6 +459,7 @@ def runtime_main(test_class): model = test_class() role = model.build_role(args) + # for distributed inference if args.test and args.model_dir != "": avg_cost = model.net(args, is_train=False) dist_infer = DistributedInfer() @@ -407,12 +468,16 @@ def runtime_main(test_class): loss=model.avg_cost, role_maker=role, dirname=args.model_dir) + if fleet.is_worker(): with paddle.static.program_guard( main_program=dist_infer.get_dist_infer_program()): model.do_distributed_testing(fleet) fleet.stop_worker() - return + return + + if fleet.is_server(): + return fleet.init(role) strategy = model.build_strategy(args) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py index 1a3ef2b3fda539acb33db6f79bd75b36a0f79b07..3beb1d3dfe0331d09961da7c64ee95987fe025a7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py @@ -36,7 +36,9 @@ class TestDistMnistAsync2x2(TestFleetBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", - "CPU_NUM": "2" + "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -71,7 +73,9 @@ class TestDistCtrHalfAsync2x2(TestFleetBase): "FLAGS_communicator_send_queue_size": "2", "FLAGS_communicator_max_merge_var_num": "2", "CPU_NUM": "2", - "SAVE_MODEL": "0" + "SAVE_MODEL": "0", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py index 6791d5bbe319377868e6c27d311ba5d9ec2659db..e73eff2acc9671d398fdf7bb6047effcc5c7cfc3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py @@ -38,7 +38,9 @@ class TestDistMnistSync2x2(TestFleetBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", - "CPU_NUM": "2" + "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -75,7 +77,9 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase): "dump_param": "concat_0.tmp_0", "dump_fields": "dnn-fc-3.tmp_0,dnn-fc-3.tmp_0@GRAD", "dump_fields_path": tempfile.mkdtemp(), - "Debug": "1" + "Debug": "1", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py index a98407294b392f1b8128cc7ad9cb9ac17f44a44c..207953e92b20f6666406979d8c4962f3140be147
100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py @@ -42,7 +42,9 @@ class TestDistGeoCtr_2x2(TestFleetBase): "PYTHONPATH": os.getenv("PYTHONPATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast - "http_proxy": "" + "http_proxy": "", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -55,7 +57,7 @@ class TestDistGeoCtr_2x2(TestFleetBase): def test_dist_train(self): self.check_with_place( - "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) + "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) class TestGeoSgdTranspiler(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py index 3d24328c9d0c305c9dcfb384ef2be49b0d58d8c6..82a3d73da2c714372435d12e97df837b247fc8ec 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_infer.py @@ -27,17 +27,6 @@ class TestDistCtrInfer(TestFleetBase): def _setup_config(self): self._mode = "async" self._reader = "pyreader" - self._need_test = 1 - - data_url = "https://fleet.bj.bcebos.com/unittest/ctr_saved_params.tar.gz" - data_md5 = "aa7e8286ced566ea8a67410be7482438" - module_name = "ctr_saved_params" - path = download(data_url, module_name, data_md5) - print('ctr_params is downloaded at ' + path) - tar = tarfile.open(path) - unzip_folder = tempfile.mkdtemp() - tar.extractall(unzip_folder) - self._model_dir = unzip_folder def check_with_place(self, model_file, @@ -53,6 +42,8 @@ class TestDistCtrInfer(TestFleetBase): "FLAGS_communicator_send_queue_size": "2", "FLAGS_communicator_max_merge_var_num": "2", "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -64,9 +55,21 @@ class TestDistCtrInfer(TestFleetBase): tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) def test_dist_infer(self): + model_dirname = tempfile.mkdtemp() + + self.check_with_place( + "dist_fleet_ctr.py", + delta=1e-5, + check_error_log=False, + need_envs={"SAVE_DIRNAME": model_dirname, }) + + self._need_test = 1 + self._model_dir = model_dirname + self.check_with_place( "dist_fleet_ctr.py", delta=1e-5, check_error_log=False) - shutil.rmtree(self._model_dir) + + shutil.rmtree(model_dirname) class TestDistCtrTrainInfer(TestFleetBase): @@ -80,6 +83,7 @@ class TestDistCtrTrainInfer(TestFleetBase): delta=1e-3, check_error_log=False, need_envs={}): + required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -89,6 +93,8 @@ class TestDistCtrTrainInfer(TestFleetBase): "FLAGS_communicator_send_queue_size": "2", "FLAGS_communicator_max_merge_var_num": "2", "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..e729bfe0537528ed9d225e65823f1eb4f06a0f5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_raw_program_optimizer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from test_dist_base import TestDistBase +import paddle +import os + +paddle.enable_static() +flag_name = os.path.splitext(__file__)[0] + + +class TestFleetMetaOptimizerPrecision(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + self._nccl2_reduce_layer = True + self._use_fleet_api = True + self._use_fleet_api_20 = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "dist_fleet_raw_program_optimizer.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py index 637dafe1c57e196fbd709a9afb0fc09785643c2e..4e0241c1e9c52fa617fe88ea1b0ea30d43e8ed3d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_sparse_embedding_ctr.py @@ -45,7 +45,9 @@ class TestDistMnistSync2x2(TestFleetBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", - "CPU_NUM": "2" + "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -79,7 +81,9 @@ class TestDistMnistAsync2x2(TestFleetBase): "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", - "CPU_NUM": "2" + "CPU_NUM": "2", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -114,7 +118,9 @@ class TestDistMnistAsync2x2WithDecay(TestFleetBase): "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", "CPU_NUM": "2", - "DECAY": "0" + "DECAY": "0", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -149,7 +155,9 @@ class TestDistMnistAsync2x2WithUnifrom(TestFleetBase): "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "http_proxy": "", "CPU_NUM": "2", - "INITIALIZER": "1" + "INITIALIZER": "1", + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) @@ -264,6 +272,7 @@ class TestDistMnistAsync2x2WithGauss(TestFleetBase): check_error_log=False, need_envs={}): model_dir = tempfile.mkdtemp() + required_envs = { "PATH": os.getenv("PATH", ""), "PYTHONPATH": os.getenv("PYTHONPATH", ""), @@ -272,7 +281,9 @@ class TestDistMnistAsync2x2WithGauss(TestFleetBase): "http_proxy": "", "CPU_NUM": "2", "INITIALIZER": "2", - "MODEL_DIR": model_dir + "MODEL_DIR": model_dir, + "LOG_DIRNAME": "/tmp", + "LOG_PREFIX": self.__class__.__name__, } required_envs.update(need_envs) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py 
b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py index 6de04c14bfa7080bcbf5e3b4c55f98da0f09a863..332603b812955000b4a58d31fd14b21225a9a0c8 100755 --- a/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_recompute.py @@ -92,15 +92,12 @@ class Naive_fc_net(paddle.nn.Layer): return inputs -def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): +def run_model(recompute_block=[], recompute_kwargs={}, enable_autocast=False): gen = paddle.seed(10) gen.manual_seed(10) np.random.seed(10) random.seed(10) - if cuda_state: - paddle.set_cuda_rng_state(cuda_state) - batch_size, input_size = 1, 10 model = Naive_fc_net( input_size, @@ -110,19 +107,27 @@ def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + if enable_autocast: + scaler = paddle.amp.GradScaler() + loss_ = [] param_ = [] grad_ = [] for step in range(10): + x_data = np.random.randn(batch_size, input_size).astype(np.float32) x = paddle.to_tensor(x_data) # x.stop_gradient = False - y_pred = model(x) - loss = y_pred.mean() - - loss_.append(np.asarray(loss).tolist()) - loss.backward() - optimizer.step() + with paddle.amp.auto_cast(enable_autocast): + y_pred = model(x) + loss = y_pred.mean() + loss_.append(np.asarray(loss).tolist()) + if enable_autocast: + scaler.scale(loss).backward() + scaler.minimize(optimizer, loss) + else: + loss.backward() + optimizer.step() param_.append(np.asarray(model.parameters()[9]).tolist()) grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist()) @@ -138,25 +143,57 @@ class TestPyLayer(unittest.TestCase): self.assertEqual(param_ref, param) self.assertEqual(grad_ref, grad) - cuda_state = paddle.get_cuda_rng_state() + # without recompute + loss_ref, param_ref, grad_ref = run_model(recompute_block=[]) + + # recompute second block + loss, param, grad = run_model(recompute_block=[1]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute fourth block + loss, param, grad = run_model(recompute_block=[3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second to fourth block + loss, param, grad = run_model(recompute_block=[1, 2, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute second & fourth block + loss, param, grad = run_model(recompute_block=[1, 3]) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + def test_fc_net_without_restore_rng(self): + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[2], + recompute_kwargs={"preserve_rng_state": False}, + enable_autocast=True) + + def test_fc_net_with_amp(self): + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + # without recompute loss_ref, param_ref, grad_ref = run_model( - cuda_state, recompute_block=[]) + recompute_block=[], enable_autocast=True) # recompute second block - loss, param, grad = run_model(cuda_state, recompute_block=[1, 3]) + loss, param, grad = run_model(recompute_block=[1], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) # recompute fourth block - loss, param, grad = run_model(cuda_state, recompute_block=[3]) + loss, param, grad = run_model(recompute_block=[3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) # recompute second
to fourth block - loss, param, grad = run_model(cuda_state, recompute_block=[1, 2, 3]) + loss, param, grad = run_model( + recompute_block=[1, 2, 3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) # recompute second & fourth block - loss, param, grad = run_model(cuda_state, recompute_block=[1, 3]) + loss, param, grad = run_model( + recompute_block=[1, 3], enable_autocast=True) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) def test_recompute_kwargs(self): @@ -164,12 +201,12 @@ class TestPyLayer(unittest.TestCase): kwargs = {"is_test": False} with self.assertRaises(ValueError): loss_ref, param_ref, grad_ref = run_model( - None, recompute_block=[2], recompute_kwargs=kwargs) + recompute_block=[2], recompute_kwargs=kwargs) def test_recompute_cpu_rng(self): paddle.set_device("cpu") with self.assertRaises(RuntimeError): - loss_ref, param_ref, grad_ref = run_model(None, recompute_block=[2]) + loss_ref, param_ref, grad_ref = run_model(recompute_block=[2]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index be5e87b9d344bb4e56955996c2a730d297373279..af020548af376b0c432d04d8d54922ef9d0d6dae 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -530,7 +530,8 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', - 'c_gen_nccl_id', 'c_comm_init', 'fill_constant', 'c_allreduce_sum', + 'c_sync_calc_stream', 'c_gen_nccl_id', 'c_comm_init', + 'fill_constant', 'c_allreduce_sum', 'c_sync_calc_stream', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init' ]) diff --git a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py index d5056bd11cf0845e8694e75d805f4197a5fb2024..49a3dedbf26a2f104ea5e7a480447b938ed8b19b 100644 --- a/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py +++ b/python/paddle/fluid/tests/unittests/test_grid_sampler_op.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import paddle import unittest import numpy as np import paddle.fluid.core as core -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci +paddle.enable_static() def AffineGrid(theta, grid_shape): @@ -160,7 +162,6 @@ class TestGridSamplerOp(OpTest): "padding_mode": self.padding_mode, "mode": self.mode } - # print("X: {}".format(x)) self.outputs = { 'Output': GridSampler(x, grid, self.align_corners, self.mode, self.padding_mode) @@ -237,5 +238,41 @@ class Case4(TestGridSamplerOp): self.numeric_grad_delta = 0.0001 +@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + + "however it is desirable to cover the forward pass") +class LargeInputCase(TestGridSamplerOp): + def get_places(self): + places = [] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def initTestCase(self): + self.no_need_check_grad = True + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = False + self.padding_mode = "reflection" + self.mode = "bilinear" + + def test_check_grad_normal(self): + pass + + +@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " + + "however it is desirable to cover the forward pass") +class Case5(LargeInputCase): + def initTestCase(self): + self.no_need_check_grad = True + self.x_shape = (2, 3, 128, 128) + self.grid_shape = (2, 130, 130, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" + self.use_cudnn = False if core.is_compiled_with_rocm() else True + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 2ba79cc9e4396b3adf0cf1f942df47a6f11e4ac1..0e13ca175620254f45f26f0a2426705895b216e4 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -53,6 +53,15 @@ class TestDygraphGroupNormv2(unittest.TestCase): weight_attr=False, bias_attr=False) + def test_nn_exception(): + with fluid.dygraph.guard(p): + + def attr_data_format(): + out = paddle.nn.GroupNorm( + num_groups=2, num_channels=2, data_format="NHWC") + + self.assertRaises(ValueError, attr_data_format) + x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) @@ -61,6 +70,7 @@ class TestDygraphGroupNormv2(unittest.TestCase): print("y1:", y1, "\ty2:", y2) self.assertTrue(result) test_weight_bias_false() + test_nn_exception() def test_static(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/fluid/tests/unittests/test_lambv2_op.py b/python/paddle/fluid/tests/unittests/test_lambv2_op.py index 7ffc056812f2ba1f1d1ace5d1fdc3fcf226dbd05..861418679a36620d2a31bf375de50c65cc10b5ea 100644 --- a/python/paddle/fluid/tests/unittests/test_lambv2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lambv2_op.py @@ -155,5 +155,31 @@ class TestLambOpWithCombinedOp(unittest.TestCase): self.assertTrue(np.allclose(out, output)) +class TestLambOpV2Group(TestLambOpV2): + def test_lamb_op(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be any optimizer supported by dygraph. 
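+ # Each dict below defines one parameter group, as exercised here: group-level keys such as 'lamb_weight_decay', 'beta1' and 'beta2' override the optimizer-level defaults for that group only, so linear_1 keeps lamb_weight_decay=0.01 while linear_2 trains with 0.001.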
+ adam = paddle.optimizer.Lamb( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'lamb_weight_decay': 0.001, + 'beta1': 0.9, + 'beta2': 0.99 + }], + lamb_weight_decay=0.01) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lgamma_op.py b/python/paddle/fluid/tests/unittests/test_lgamma_op.py new file mode 100644 index 0000000000000000000000000000000000000000..686d5b1eb6dfefc024ffb435f802dea25fe1d2e0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lgamma_op.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import math +import numpy as np +import paddle +from op_test import OpTest + +paddle.enable_static() + + +class TestLgammaOp(OpTest): + def setUp(self): + self.op_type = 'lgamma' + self.init_dtype_type() + shape = (5, 20) + data = np.random.random(shape).astype(self.dtype) + 1 + self.inputs = {'X': data} + result = np.ones(shape).astype(self.dtype) + for i in range(shape[0]): + for j in range(shape[1]): + result[i][j] = math.lgamma(data[i][j]) + self.outputs = {'Out': result} + + def init_dtype_type(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=1e-7) + + +class TestLgammaOpFp32(TestLgammaOp): + def init_dtype_type(self): + self.dtype = np.float32 + + def test_check_grad_normal(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.005) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_marker_op.py b/python/paddle/fluid/tests/unittests/test_marker_op.py new file mode 100644 index 0000000000000000000000000000000000000000..3f9f8c7d6bc8c1b9f2f346ae464a1d90e4507c33 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_marker_op.py @@ -0,0 +1,36 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest +import numpy as np +from op_test import OpTest +from paddle.distributed.fleet.meta_optimizers.common import OpRole + + +class TestMarkerOp(OpTest): + def setUp(self): + self.op_type = "marker" + self.inputs = {} + self.attrs = { + 'marker_role': 'forward', + 'marker_pos': 'B', + 'op_role': OpRole.Forward + } + self.outputs = {} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 8f629b15224287bdb4f53de90cfc526bf12ad4d8..e31587b225ebae4e5a72faa43a2e3bc31263d0d1 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -134,6 +134,64 @@ class TestMomentumOp2(OpTest): self.check_output() +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestLarsMomentumOpWithMP(OpTest): + def setUp(self): + self.op_type = "lars_momentum" + + master_param = np.random.random((123, 321)).astype("float32") + param = master_param.astype("float16") + grad = np.random.random((123, 321)).astype("float16") + velocity = np.zeros((123, 321)).astype("float32") + learning_rate = np.array([0.001]).astype("float32") + mu = 0.0001 + lars_coeff = 0.001 + lars_weight_decay = 0.0005 + rescale_grad = 1.0 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Velocity': velocity, + 'LearningRate': learning_rate, + 'MasterParam': master_param, + } + + self.attrs = { + 'mu': mu, + 'lars_coeff': lars_coeff, + 'lars_weight_decay': lars_weight_decay, + 'multi_precision': True, + 'rescale_grad': rescale_grad + } + + fp32_grad = grad.astype("float32") + pnorm = np.sqrt(np.square(master_param).sum()) + gnorm = np.sqrt(np.square(fp32_grad).sum()) + local_lr = learning_rate * lars_coeff * pnorm / ( + gnorm + lars_weight_decay * pnorm) + fp32_grad = fp32_grad * rescale_grad + velocity_out = mu * velocity + local_lr * (fp32_grad + lars_weight_decay + * master_param) + p_new = master_param - velocity_out + param_out = p_new.astype("float16") + master_param_out = p_new + + self.outputs = { + 'ParamOut': param_out, + 'VelocityOut': velocity_out, + 'MasterParamOut': master_param_out + } + + def test_check_output(self): + paddle.enable_static() + if core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + class TestLarsMomentumOp(OpTest): def setUp(self): self.op_type = "lars_momentum" @@ -610,5 +668,32 @@ class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): self.__test_vs(place=place) +class TestMomentumV2Group(TestMomentumV2): + def test_momentum_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be any optimizer supported by dygraph. 
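+ # Same parameter-group pattern as the Lamb test above: linear_2's group overrides 'weight_decay', 'learning_rate' and 'momentum', while linear_1 falls back to the optimizer-level defaults (weight_decay=0.1, momentum=0.9).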
+ adam = paddle.optimizer.Momentum( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99 + }], + weight_decay=0.1, + momentum=0.9) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py index ddac7f6b98b19d204d20ccdff75c6d4fcae50d4d..08ab2e18c733a6ba4bad904f10abce2baf9517ed 100644 --- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py +++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py @@ -240,6 +240,7 @@ class TestRMSPropV2(unittest.TestCase): adam.clear_gradients() def test_rmsprop(self): + paddle.enable_static() place = fluid.CPUPlace() main = fluid.Program() with fluid.program_guard(main): @@ -290,5 +291,29 @@ class TestRMSPropV2(unittest.TestCase): 0.1, rho=-1, parameters=linear.parameters()) +class TestRMSPropV2Group(TestRMSPropV2): + def test_rmsprop_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be any optimizer supported by dygraph. + adam = paddle.optimizer.RMSProp( + learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001 + }], + weight_decay=0.01) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index f6332859f92f7af78bb664c3b12038ce9f767096..81490642fa8c12f1d67bdcd3fbe128d82c65daff 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -19,10 +19,13 @@ import unittest import numpy as np import six +import paddle import paddle.fluid as fluid from paddle import compat as cpt from paddle.fluid import core, framework, executor +paddle.enable_static() + @contextlib.contextmanager def program_scope_guard(): @@ -164,6 +167,8 @@ class RunProgramOpTest(unittest.TestCase): persistable=True) inner_scope = core.Scope() outputs['OutScope'].value().set_scope(inner_scope) + + outputs['DOut'] = [create_var_base(False, "Fake_var")] return outputs def calc_dygraph_output(self, place): diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py index 2c87e06e893a4d6495ad81ac3dcdf375a41272fb..afa004e769e092317a7fbf9551d067dc19f9c0f8 100644 --- a/python/paddle/fluid/tests/unittests/test_sgd_op.py +++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py @@ -225,6 +225,7 @@ class TestSGDV2(unittest.TestCase): adam.clear_gradients() def test_sgd(self): + paddle.enable_static() place = fluid.CPUPlace() main = fluid.Program() with fluid.program_guard(main): @@ -250,5 +251,29 @@ class TestSGDV2(unittest.TestCase): self.assertRaises(ValueError, paddle.optimizer.SGD, learning_rate=None) +class TestSGDV2Group(TestSGDV2): + def test_sgd_dygraph(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear_1 = paddle.nn.Linear(13, 5) + linear_2 = paddle.nn.Linear(5, 3) + # This can be 
any optimizer supported by dygraph. + adam = paddle.optimizer.SGD(learning_rate=0.01, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1 + }], + weight_decay=0.01) + out = linear_1(a) + out = linear_2(out) + out.backward() + adam.step() + adam.clear_gradients() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py index 6efab81a265ea8d849b42a1be130ecfae14d269f..14547eca5aca2cf6411eb9ea3b321f96dcc831fa 100644 --- a/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py +++ b/python/paddle/fluid/tests/unittests/test_spawn_and_init_parallel_env.py @@ -20,7 +20,7 @@ import unittest import paddle import paddle.distributed as dist -from paddle.distributed.spawn import _get_subprocess_env_list, _options_valid_check +from paddle.distributed.spawn import _get_subprocess_env_list, _options_valid_check, _get_default_nprocs from paddle.fluid import core from paddle.fluid.dygraph import parallel_helper @@ -87,6 +87,15 @@ class TestSpawnAssistMethod(unittest.TestCase): options['error'] = "error" _options_valid_check(options) + def test_get_default_nprocs(self): + paddle.set_device('cpu') + with self.assertRaises(RuntimeError): + nprocs = _get_default_nprocs() + + paddle.set_device('gpu') + nprocs = _get_default_nprocs() + self.assertEqual(nprocs, core.get_cuda_device_count()) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 13aa7d3d37dd4f5253acc04661dce09cb6925435..47a6d2b811552763506e4b213894eead7c992e2d 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -248,7 +248,7 @@ class TestConvertSyncBatchNorm(unittest.TestCase): isinstance(model[idx], paddle.nn.SyncBatchNorm), True) -class TestConvertSyncBatchNormCase2(unittest.TestCase): +class TestConvertSyncBatchNormCast1(unittest.TestCase): def test_convert(self): if not core.is_compiled_with_cuda(): return @@ -277,5 +277,70 @@ class TestConvertSyncBatchNormCase2(unittest.TestCase): self.assertEqual(len(compare_model.sublayers()), len(model.sublayers())) +class TestConvertSyncBatchNormCase2(unittest.TestCase): + def test_convert(self): + if not core.is_compiled_with_cuda(): + return + + with fluid.dygraph.guard(fluid.CUDAPlace(0)): + + class SyBNNet(paddle.nn.Layer): + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(SyBNNet, self).__init__() + self.bn_s1 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D( + out_ch, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.)))) + self.bn_s2 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D( + out_ch, data_format='NDHWC')) + + def forward(self, x): + x = self.bn_s1(x) + out = paddle.sum(paddle.abs(self.bn_s2(x))) + return out + + class BNNet(paddle.nn.Layer): + def __init__(self, in_ch=3, out_ch=3, dirate=1): + super(BNNet, self).__init__() + self.bn_s1 = paddle.nn.BatchNorm3D( + out_ch, + weight_attr=paddle.ParamAttr( + regularizer=paddle.regularizer.L2Decay(0.))) + self.bn_s2 = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( + paddle.nn.BatchNorm3D( + out_ch, data_format='NDHWC')) + + def forward(self, x): + x = self.bn_s1(x) + out = 
paddle.sum(paddle.abs(self.bn_s2(x))) + return out + + bn_model = BNNet() + sybn_model = SyBNNet() + np.random.seed(10) + data = np.random.random([3, 3, 3, 3, 3]).astype('float32') + x = paddle.to_tensor(data) + bn_out = bn_model(x) + sybn_out = sybn_model(x) + self.assertTrue( + np.allclose(bn_out.numpy(), sybn_out.numpy()), + "Output has diff. \n" + "\nBN " + str(bn_out.numpy()) + "\n" + + "Sync BN " + str(sybn_out.numpy())) + + +class TestDygraphSyncBatchNormDataFormatError(unittest.TestCase): + def test_errors(self): + if not core.is_compiled_with_cuda(): + return + + with fluid.dygraph.guard(fluid.CUDAPlace(0)): + my_sync_batch_norm = paddle.nn.SyncBatchNorm(10, data_format='CN') + data = np.random.random([3, 3, 3]).astype('float32') + x = paddle.to_tensor(data) + self.assertRaises(ValueError, my_sync_batch_norm, x) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py index 5f2dfbdd99e1611c61883b9a34cccc5ac0ec8b71..ba375f8b3c8a41726c39cd890d53edcd33bbf6f4 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py @@ -187,6 +187,13 @@ class TestTensorScalarTypePromotionDynamic(unittest.TestCase): c = paddle.full([2, 2, 2], 0.5, dtype="float32") self.check_operation(a, b, c, '/') + # tensor(float32) / scalar(int) + # this behavior should be equal to elementwise_div Op + a = paddle.to_tensor([99, 99, 99], dtype='float32') + b = 100 + c = a / paddle.to_tensor([100, 100, 100], dtype='float32') + self.check_operation(a, b, c, '/') + # tensor(int64) / scalar(float, .0) a = paddle.ones([2, 2, 2], dtype='int64') b = 2.0 diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py index d697666e12ddd15859b2908b42f43202e3de93ab..aa24161687004b5155b429752d684053a487abb4 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py @@ -218,6 +218,12 @@ class TestTensorScalarTypePromotionStatic(unittest.TestCase): c = paddle.full([2, 2, 2], 0.5, dtype="float32") self.check_operation(a, b, c, '/') + # this behavior should be equal to elementwise_div Op + a = paddle.full([2, 2, 2], 99, dtype="float32") + b = 100 + c = a / paddle.full([2, 2, 2], 100, dtype="float32") + self.check_operation(a, b, c, '/') + # tensor(int64) / scalar(float, .0) with program_guard(Program()): a = paddle.ones([2, 2, 2], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index b3671327ca2959e9331bb4875fabee19f72010ae..b8d29d482fefa928f949618ed7bf697a9675df1b 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -230,6 +230,14 @@ class TestVarBase(unittest.TestCase): _test_place(core.CUDAPlace(0)) _test_place("gpu:0") + def test_to_tensor_not_change_input_stop_gradient(self): + with paddle.fluid.dygraph.guard(core.CPUPlace()): + a = paddle.zeros([1024]) + a.stop_gradient = False + b = paddle.to_tensor(a) + self.assertEqual(a.stop_gradient, False) + self.assertEqual(b.stop_gradient, True) + def test_to_tensor_change_place(self): if 
core.is_compiled_with_cuda(): a_np = np.random.rand(1024, 1024) @@ -260,8 +268,9 @@ class TestVarBase(unittest.TestCase): with paddle.fluid.dygraph.guard(core.CUDAPlace(0)): lod_tensor = core.LoDTensor() lod_tensor.set(a_np, core.CUDAPlace(0)) - a = paddle.to_tensor(lod_tensor) + a = paddle.to_tensor(lod_tensor, place=core.CPUPlace()) self.assertTrue(np.array_equal(a_np, a.numpy())) + self.assertEqual(a.place.__repr__(), "CPUPlace") def test_to_variable(self): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 71051689dbc1572600e34d920e1b1f43ca2e9524..6ffecd33f8f48d69ffc7593cb684a93f2d4be226 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -164,12 +164,75 @@ class TestVariable(unittest.TestCase): self.assertTrue( np.array_equal(local_out[15], tensor_array[::-1, ::-1, ::-1])) - def test_slice(self): - place = fluid.CPUPlace() - self._test_slice(place) + def _test_slice_index_tensor(self, place): + data = np.random.rand(2, 3).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + idx0 = [1, 0] + idx1 = [0, 1] + idx2 = [0, 0] + idx3 = [1, 1] + + out0 = x[paddle.assign(np.array(idx0))] + out1 = x[paddle.assign(np.array(idx1))] + out2 = x[paddle.assign(np.array(idx2))] + out3 = x[paddle.assign(np.array(idx3))] + + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=[out0, out1, out2, out3]) + + expected = [data[idx0], data[idx1], data[idx2], data[idx3]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + + with self.assertRaises(IndexError): + one = paddle.ones(shape=[1]) + res = x[one, [0, 0]] + + def _test_slice_index_list(self, place): + data = np.random.rand(2, 3).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + idx0 = [1, 0] + idx1 = [0, 1] + idx2 = [0, 0] + idx3 = [1, 1] + + out0 = x[idx0] + out1 = x[idx1] + out2 = x[idx2] + out3 = x[idx3] + + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=[out0, out1, out2, out3]) + + expected = [data[idx0], data[idx1], data[idx2], data[idx3]] + + self.assertTrue((result[0] == expected[0]).all()) + self.assertTrue((result[1] == expected[1]).all()) + self.assertTrue((result[2] == expected[2]).all()) + self.assertTrue((result[3] == expected[3]).all()) + + with self.assertRaises(IndexError): + res = x[[1, 0], [0, 0]] + + with self.assertRaises(TypeError): + res = x[[1.2, 0]] + def test_slice(self): + places = [fluid.CPUPlace()] if core.is_compiled_with_cuda(): - self._test_slice(core.CUDAPlace(0)) + places.append(core.CUDAPlace(0)) + + for place in places: + self._test_slice(place) + self._test_slice_index_tensor(place) + self._test_slice_index_list(place) def _tostring(self): b = default_main_program().current_block() @@ -232,5 +295,61 @@ self.assertRaises(Exception, _test) +class TestVariableSlice(unittest.TestCase): + def _test_item_none(self, place): + data = np.random.rand(2, 3, 4).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + out0 = x[0:, None, 1:] + out1 = x[0:, None] + out2 = x[None, 1:] + out3 = x[None] + + outs = [out0,
out1, out2, out3] + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=outs) + + expected = [ + data[0:, None, 1:], data[0:, None], data[None, 1:], data[None] + ] + for i in range(len(outs)): + self.assertEqual(outs[i].shape, expected[i].shape) + self.assertTrue((result[i] == expected[i]).all()) + + def _test_item_none_and_decrease(self, place): + data = np.random.rand(2, 3, 4).astype("float32") + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + x = paddle.assign(data) + out0 = x[0, 1:, None] + out1 = x[0, None] + out2 = x[None, 1] + out3 = x[None] + out4 = x[0, 0, 0, None] + out5 = x[None, 0, 0, 0, None] + + outs = [out0, out1, out2, out3, out4, out5] + exe = paddle.static.Executor(place) + result = exe.run(prog, fetch_list=outs) + expected = [ + data[0, 1:, None], data[0, None], data[None, 1], data[None], + data[0, 0, 0, None], data[None, 0, 0, 0, None] + ] + + for i in range(len(outs)): + self.assertEqual(outs[i].shape, expected[i].shape) + self.assertTrue((result[i] == expected[i]).all()) + + def test_slice(self): + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._test_item_none(place) + self._test_item_none_and_decrease(place) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 6076e9dc9f60405c4b5e4dde002191e9f1fdcd5b..c771531b7b61be7933b5355204c532b847b13dc5 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -45,6 +45,7 @@ NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST = [ 'bilateral_slice',\ 'cudnn_lstm', \ 'rnn', \ + 'lgamma', \ ] NEED_FIX_FP64_CHECK_OUTPUT_THRESHOLD_OP_LIST = ['bilinear_interp',\ diff --git a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py index 3eefa0bce886367ad8b80f30a5bfd884ae613ded..7b74a8bb3836597dacae467e459584506979540f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_assign_op_xpu.py @@ -82,11 +82,8 @@ class TestAssignOpError(unittest.TestCase): x1 = fluid.create_lod_tensor( np.array([[-1]]), [[1]], fluid.XPUPlace(0)) self.assertRaises(TypeError, fluid.layers.assign, x1) - # When the type of input is Variable, the dtype of input must be float16, float32, float64, int32, int64, bool. 
-            x3 = fluid.layers.data(name='x3', shape=[4], dtype="uint8")
-            self.assertRaises(TypeError, fluid.layers.assign, x3)
-            x4 = np.array([[2.5, 2.5]], dtype='uint8')
-            self.assertRaises(TypeError, fluid.layers.assign, x4)
+            x2 = np.array([[2.5, 2.5]], dtype='uint8')
+            self.assertRaises(TypeError, fluid.layers.assign, x2)


 if __name__ == '__main__':
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 989db9efea119d5579710235d28729ee980fd92f..92a900e6c371586eb23dbda06345cfe449912ea6 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -17,7 +17,7 @@ import sys
 import os
 __all__ = [
     'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer',
-    'HeterXpuTrainer', 'HeterBoxWorker'
+    'HeterXpuTrainer', 'HeterBoxTrainer'
 ]
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index 00dea8d1251f4b2446fce13ca8aff665a35d0d97..95379a34c22144b1a17fcced5556291de15eaaa5 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -27,7 +27,7 @@ from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT
 from .framework import Variable
 from multiprocessing import Process, Manager

-__all__ = ["TrainerFactory", "FetchHandler", "FetchHandlerMonitor"]
+__all__ = ["TrainerFactory", "FetchHandlerMonitor"]


 class TrainerFactory(object):
diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py
index 242b5b14db2bcc3e32f9d5d7144d50a822cb99c6..aed8c82d43b4dda373d30916ac291b4eff8a1064 100644
--- a/python/paddle/fluid/variable_index.py
+++ b/python/paddle/fluid/variable_index.py
@@ -50,6 +50,17 @@ def replace_ellipsis(var, item):
     return item


+def replace_none(item):
+    new_item = []
+    none_axes = []
+    for i, slice_item in enumerate(item):
+        if slice_item is None:
+            none_axes.append(i)
+        else:
+            new_item.append(slice_item)
+    return new_item, none_axes
+
+
 def is_integer_or_scalar_tensor(ele):
     from .framework import Variable
     if isinstance(ele, int):
@@ -87,7 +98,7 @@ def _getitem_impl_(var, item):
     Returns:
         Sliced variable
     """
-    from .framework import default_main_program
+    from .framework import default_main_program, Variable

     if not isinstance(item, tuple):
         item = (item, )
@@ -97,9 +108,10 @@
     starts = []
     ends = []
     steps = []
-    reverse_axis = []
+    reverse_axes = []
     use_strided_slice = False

+    item, none_axes = replace_none(item)
     for dim, slice_item in enumerate(item):
         if is_integer_or_scalar_tensor(slice_item):
@@ -120,12 +132,37 @@

             if start is None and end is None:
                 assert (step == -1)
-                reverse_axis.append(dim)
+                reverse_axes.append(dim)
                 continue

             start = 0 if start is None else start
             end = MAX_INTEGER if end is None else end

+        elif isinstance(slice_item, list):
+            for i in slice_item:
+                if not isinstance(i, int):
+                    raise TypeError("Only support int value in list")
+
+            if len(item) != 1:
+                raise IndexError(
+                    "When index contains a list, its length must be 1, but received {}".
+                    format(len(item)))
+
+            from .layers import assign
+            from ..tensor import index_select
+
+            idx = assign(np.array(slice_item))
+            return index_select(var, index=idx, axis=0)
+
+        elif isinstance(slice_item, Variable):
+            if len(item) != 1:
+                raise IndexError(
+                    "When index contains a Tensor, its length must be 1, but received {}".
+                    format(len(item)))
+
+            from ..tensor import index_select
+            return index_select(var, index=slice_item, axis=0)
+
         else:
             raise IndexError(
                 "Valid index accept int or slice or ellipsis, but received {}.".
@@ -170,9 +207,38 @@
             attrs=attrs)
         out = slice_out_var

-    if len(reverse_axis) > 0:
+    if len(reverse_axes) > 0:
         from .layers.tensor import reverse
-        out = reverse(out, axis=reverse_axis)
+        out = reverse(out, axis=reverse_axes)
+
+    # Deal with cases when all axes are decreased.
+    # After slice, the shape of out is [1], which should have been [], but Paddle doesn't support scalar.
+    # In order to ensure the correctness of the final shape of out, one dimension of out needs to be decreased.
+    # For example:
+    #   # x.shape: (2,3,4)
+    #   out = x[0, 1, 1, None]  # out.shape : (1)
+    if len(decrease_axes) == len(var.shape):
+        none_axes = none_axes[1:]
+
+    if len(none_axes) > 0:
+        # Deal with cases that decrease_axes is not empty
+        # For example:
+        #   # x.shape: (2,3,4)
+        #   out = x[0, 0:2, None]  # out.shape : (2, 1, 4)
+        for idx, axis in enumerate(none_axes):
+            l = len([i for i in decrease_axes if i < axis])
+            new_axis = axis - l
+            none_axes[idx] = new_axis
+
+        # Deal with cases when all axes are decreased.
+        # After slice, the shape of out is [1], which should have been [], but Paddle doesn't support scalar.
+        # In order to ensure the correctness of the final shape of out, one dimension of out needs to be decreased.
+        # For example:
+        #   # x.shape: (2,3,4)
+        #   out = x[0, 1, 1, None]  # out.shape : (1)
+
+        from ..tensor import unsqueeze
+        out = unsqueeze(out, axis=none_axes)

     return out
diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py
index 61ae8b42d63a909cc4dc88d4b16f0b0e8ed83c71..834b92f9fe6a0c0abaa0946d1676029d4849ae45 100644
--- a/python/paddle/hapi/callbacks.py
+++ b/python/paddle/hapi/callbacks.py
@@ -395,6 +395,10 @@ class ProgBarLogger(Callback):
                 values.append(
                     ('ips', "%.5f samples/sec" %
                      (samples / (timer['data_time'] + timer['batch_time']))))
+            timer['count'] = 0
+            timer['samples'] = 0
+            timer['data_time'] = 0.
+            timer['batch_time'] = 0.
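# ---------------------------------------------------------------------------
# Editor's note (not part of the diff): a minimal, self-contained sketch of the
# `None`-index bookkeeping that `_getitem_impl_` above performs. Assumes a
# Paddle 2.x build; `remap_none_axes` is an illustrative helper, not Paddle API.
import numpy as np
import paddle

def remap_none_axes(none_axes, decrease_axes):
    # After `slice` drops the decreased axes, each None axis shifts left by
    # the number of decreased axes that sat in front of it (new = old - l).
    return [axis - len([d for d in decrease_axes if d < axis])
            for axis in none_axes]

# x.shape == (2, 3, 4); x[0, 0:2, None]: axis 0 is decreased and the None sits
# at raw position 2, so unsqueeze re-inserts it at axis 1 -> shape (2, 1, 4).
assert remap_none_axes([2], [0]) == [1]

x = paddle.to_tensor(np.random.rand(2, 3, 4).astype("float32"))
assert tuple(x[0, 0:2, None].shape) == (2, 1, 4)
assert tuple(x[None].shape) == (1, 2, 3, 4)
# ---------------------------------------------------------------------------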
             progbar.update(steps, values)
diff --git a/python/paddle/hapi/hub.py b/python/paddle/hapi/hub.py
index 54765c1d4d41cb80c42690909d1bc743f33cbed0..243bd79c191dd6af08a0dae769ca2de630d42b40 100644
--- a/python/paddle/hapi/hub.py
+++ b/python/paddle/hapi/hub.py
@@ -109,7 +109,9 @@ def _get_cache_or_reload(repo, force_reload, verbose=True, source='github'):

         url = _git_archive_link(repo_owner, repo_name, branch, source=source)

-        get_path_from_url(url, hub_dir, decompress=False)
+        fpath = get_path_from_url(
+            url, hub_dir, check_exist=not force_reload, decompress=False)
+        shutil.move(fpath, cached_file)

         with zipfile.ZipFile(cached_file) as cached_zipfile:
             extraced_repo_name = cached_zipfile.infolist()[0].filename
diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py
index d78196d94451ed38525c27f729b34e73761e92b3..93f1a5a37a67f1c5a9063a5d2e6e31160e776112 100644
--- a/python/paddle/hapi/model_summary.py
+++ b/python/paddle/hapi/model_summary.py
@@ -80,6 +80,23 @@ def summary(net, input_size, dtypes=None):
             params_info = paddle.summary(lenet, (1, 1, 28, 28))
             print(params_info)

+            # multi input demo
+            class LeNetMultiInput(LeNet):
+
+                def forward(self, inputs, y):
+                    x = self.features(inputs)
+
+                    if self.num_classes > 0:
+                        x = paddle.flatten(x, 1)
+                        x = self.fc(x + y)
+                    return x
+
+            lenet_multi_input = LeNetMultiInput()
+
+            params_info = paddle.summary(lenet_multi_input, [(1, 1, 28, 28), (1, 400)],
+                                         ['float32', 'float32'])
+            print(params_info)
+
     """
     if isinstance(input_size, InputSpec):
         _input_size = tuple(input_size.shape)
diff --git a/python/paddle/hapi/progressbar.py b/python/paddle/hapi/progressbar.py
index 5f63a3169f8ac7a09283bed98cc6f96d1193064b..6ed33f4f960b402fc97f32342a54c1c9ffd6e889 100644
--- a/python/paddle/hapi/progressbar.py
+++ b/python/paddle/hapi/progressbar.py
@@ -33,7 +33,8 @@ class ProgressBar(object):
                  width=30,
                  verbose=1,
                  start=True,
-                 file=sys.stdout):
+                 file=sys.stdout,
+                 name='step'):
        self._num = num
        if isinstance(num, int) and num <= 0:
            raise TypeError('num should be None or integer (> 0)')
@@ -47,6 +48,7 @@ class ProgressBar(object):
        if start:
            self._start = time.time()
        self._last_update = 0
+        self.name = name

        self._dynamic_display = (
            (hasattr(self.file, 'isatty') and
@@ -74,7 +76,7 @@ class ProgressBar(object):
            self.file.flush()
            self._start = time.time()

-    def update(self, current_num, values=None):
+    def update(self, current_num, values={}):
        now = time.time()

        if current_num:
@@ -83,11 +85,11 @@ class ProgressBar(object):
            time_per_unit = 0

        if time_per_unit >= 1 or time_per_unit == 0:
-            fps = ' - %.0fs/%s' % (time_per_unit, 'step')
+            fps = ' - %.0fs/%s' % (time_per_unit, self.name)
        elif time_per_unit >= 1e-3:
-            fps = ' - %.0fms/%s' % (time_per_unit * 1e3, 'step')
+            fps = ' - %.0fms/%s' % (time_per_unit * 1e3, self.name)
        else:
-            fps = ' - %.0fus/%s' % (time_per_unit * 1e6, 'step')
+            fps = ' - %.0fus/%s' % (time_per_unit * 1e6, self.name)

        info = ''
        if self._verbose == 1:
@@ -102,7 +104,7 @@ class ProgressBar(object):

            if self._num is not None:
                numdigits = int(np.log10(self._num)) + 1
-                bar_chars = ('step %' + str(numdigits) + 'd/%d [') % (
+                bar_chars = (self.name + ' %' + str(numdigits) + 'd/%d [') % (
                    current_num, self._num)
                prog = float(current_num) / self._num
                prog_width = int(self._width * prog)
@@ -116,7 +118,7 @@ class ProgressBar(object):
                bar_chars += ('.' * (self._width - prog_width))
                bar_chars += ']'
            else:
-                bar_chars = 'step %3d' % current_num
+                bar_chars = self.name + ' %3d' % current_num

            self._total_width = len(bar_chars)
            sys.stdout.write(bar_chars)
@@ -162,10 +164,10 @@ class ProgressBar(object):
        elif self._verbose == 2 or self._verbose == 3:
            if self._num:
                numdigits = int(np.log10(self._num)) + 1
-                count = ('step %' + str(numdigits) + 'd/%d') % (current_num,
-                                                                self._num)
+                count = (self.name + ' %' + str(numdigits) + 'd/%d') % (
+                    current_num, self._num)
            else:
-                count = 'step %3d' % current_num
+                count = self.name + ' %3d' % current_num
            info = count + info

            for k, val in values:
diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py
index 65b9c6771c4f190e74b5228d5233d504d5b3fc09..57ce6c78e958f8ebf256a2c3e2b48231964a81fe 100644
--- a/python/paddle/nn/functional/common.py
+++ b/python/paddle/nn/functional/common.py
@@ -1449,8 +1449,11 @@ def linear(x, weight, bias=None, name=None):
         pre_bias = _varbase_creator(dtype=x.dtype)
         core.ops.matmul(x, weight, pre_bias, 'transpose_X', False,
                         'transpose_Y', False, "alpha", 1)
-        return dygraph_utils._append_bias_in_dygraph(
-            pre_bias, bias, axis=len(x.shape) - 1)
+
+        if bias is None:
+            return pre_bias
+
+        return core.ops.elementwise_add(pre_bias, bias)
     else:
         helper = LayerHelper('linear', **locals())
         dtype = x.dtype
diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py
index 1edbc5f462ecd7b49270b74434604b700e5079d8..67958b8683fe174d2c9e387668ab8c7ee4a39276 100644
--- a/python/paddle/nn/functional/conv.py
+++ b/python/paddle/nn/functional/conv.py
@@ -85,6 +85,10 @@ def _update_padding_nd(padding, channel_last, num_dims):
         else:
             padding_algorithm = "EXPLICIT"
             padding = utils.convert_to_list(padding, num_dims, 'padding')
+            if not all([p >= 0 for p in padding]):
+                raise ValueError(
+                    "Invalid padding, all values should be greater than or equal to 0, but received: {}".
+                    format(padding))
     return padding, padding_algorithm
diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py
index 45640a6598e576c1264ed47f9a1e0a540d2d6abe..14b3419b81ff0187b77ff754435178426754a4f5 100644
--- a/python/paddle/nn/layer/norm.py
+++ b/python/paddle/nn/layer/norm.py
@@ -375,7 +375,7 @@ class GroupNorm(layers.Layer):
         self._num_channels = num_channels
         self._num_groups = num_groups
         if data_format != 'NCHW':
-            raise ValueError("unsupported data layout:" + data_layout)
+            raise ValueError("unsupported data layout:" + data_format)

         param_shape = [self._num_channels]

@@ -1057,7 +1057,18 @@ class SyncBatchNorm(_BatchNormBase):
               self).__init__(num_features, momentum, epsilon, weight_attr,
                              bias_attr, data_format, None, name)

+    def _check_data_format(self):
+        if self._data_format in ['NCHW', 'NCDHW', 'NC', 'NCL']:
+            self._data_format = 'NCHW'
+        elif self._data_format in ["NHWC", "NDHWC", 'NLC']:
+            self._data_format = 'NHWC'
+        else:
+            raise ValueError(
+                'expected \'NCDHW\', \'NDHWC\', \'NCL\', \'NLC\', \'NC\', \'NCHW\', \'NHWC\' for data_format'
+            )
+
     def forward(self, x):
+        self._check_data_format()
         # create output
         # mean and mean_out share the same memory
         mean_out = self._mean
@@ -1142,11 +1153,12 @@ class SyncBatchNorm(_BatchNormBase):
         """
         layer_output = layer
         if isinstance(layer, _BatchNormBase):
-            if layer._weight_attr != None and not isinstance(layer._weight_attr,
-                                                             bool):
+            if layer._weight_attr != None and not isinstance(
+                    layer._weight_attr,
+                    bool) and layer._weight_attr.name != None:
                 layer._weight_attr.name = layer._weight_attr.name + '_sync'
-            if layer._bias_attr != None and not isinstance(layer._weight_attr,
-                                                           bool):
+            if layer._bias_attr != None and not isinstance(
+                    layer._bias_attr, bool) and layer._bias_attr.name != None:
                 layer._bias_attr.name = layer._bias_attr.name + '_sync'

             layer_output = SyncBatchNorm(layer._num_features, layer._momentum,
diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py
index 6c10d9bc2690a09b23ed2238ddd548d65f21df36..dd088b18ca27d9b749e602988ebd3954dbaacebf 100644
--- a/python/paddle/optimizer/adadelta.py
+++ b/python/paddle/optimizer/adadelta.py
@@ -43,7 +43,10 @@ class Adadelta(Optimizer):
         epsilon (float): a small float number for numeric stability. Default 1.0e-6.
         rho (float): a floating point value indicating the decay rate. Default 0.95.
         parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
             The default value is None in static mode, at this time all parameters will be updated.
         weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It canbe a float value as coeff of L2 regularization or \
@@ -77,6 +80,27 @@ class Adadelta(Optimizer):
             adadelta.step()
             adadelta.clear_grad()

+            #Note that the learning_rate of linear_2 is 0.01.
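# ---------------------------------------------------------------------------
# Editor's note (not part of the diff): in each parameter-group example added
# below, the group-level 'learning_rate' is a multiplier on the base learning
# rate, so linear_2's effective learning rate is 0.1 (base) * 0.1 (scale) = 0.01.
# ---------------------------------------------------------------------------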
+            linear_1 = paddle.nn.Linear(10, 10)
+            linear_2 = paddle.nn.Linear(10, 10)
+            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            out = linear_1(inp)
+            out = linear_2(out)
+            loss = paddle.mean(out)
+            adadelta = paddle.optimizer.Adadelta(
+                learning_rate=0.1,
+                parameters=[{
+                    'params': linear_1.parameters()
+                }, {
+                    'params': linear_2.parameters(),
+                    'weight_decay': 0.001,
+                    'learning_rate': 0.1,
+                }],
+                weight_decay=0.01)
+            out.backward()
+            adadelta.step()
+            adadelta.clear_grad()
+
     """

     _avg_squared_grad_acc_str = "_avg_squared_grad"
@@ -105,10 +129,16 @@ class Adadelta(Optimizer):
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho
+        self._default_dict = {
+            'epsilon': epsilon,
+            'rho': rho,
+        }

     def _create_accumulators(self, block, parameters):
         if not isinstance(block, framework.Block):
             raise TypeError("block is not instance of framework.Block.")
+        if isinstance(parameters, dict):
+            parameters = parameters.get('params')

         for p in parameters:
             self._add_accumulator(self._avg_squared_grad_acc_str, p)
@@ -118,6 +148,9 @@ class Adadelta(Optimizer):
         if not isinstance(block, framework.Block):
             raise TypeError("block is not instance of framework.Block.")

+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)
+
         avg_squared_grad_acc = self._get_accumulator(
             self._avg_squared_grad_acc_str, param_and_grad[0])
         avg_squared_update_acc = self._get_accumulator(
@@ -142,3 +175,9 @@ class Adadelta(Optimizer):
             stop_gradient=True)

         return adadelta_op
+
+    def _update_param_group(self, parameters):
+        self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
+        self._rho = parameters.get('rho', self._default_dict['rho'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py
index bb934e5a9262c778029df3b29d84b6dd7a71bde3..6238d32e9c49dfa4664f2e269f415c44f06ffb3f 100644
--- a/python/paddle/optimizer/adagrad.py
+++ b/python/paddle/optimizer/adagrad.py
@@ -45,16 +45,19 @@ class Adagrad(Optimizer):
             It can be a float value or a ``Variable`` with a float type.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-06.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, at this time all parameters will be updated.
-        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-            It canbe a float value as coeff of L2 regularization or \
-            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
-            If a parameter has set regularizer using :ref:`api_paddle_fluid_param_attr_aramAttr` already, \
-            the regularization setting here in optimizer will be ignored for this parameter. \
-            Otherwise, the regularization setting here in optimizer will take effect. \
-            Default None, meaning there is no regularization.
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_paddle_regularizer_L1Decay`, :ref:`api_paddle_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_paddle_fluid_param_attr_aramAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies,
             ClipGradByGlobalNorm, ClipGradByNorm and ClipGradByValue. Default None,
@@ -81,6 +84,27 @@ class Adagrad(Optimizer):
             adagrad.step()
             adagrad.clear_grad()

+            #Note that the learning_rate of linear_2 is 0.01.
+            linear_1 = paddle.nn.Linear(10, 10)
+            linear_2 = paddle.nn.Linear(10, 10)
+            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            out = linear_1(inp)
+            out = linear_2(out)
+            loss = paddle.mean(out)
+            adagrad = paddle.optimizer.Adagrad(
+                learning_rate=0.1,
+                parameters=[{
+                    'params': linear_1.parameters()
+                }, {
+                    'params': linear_2.parameters(),
+                    'weight_decay': 0.001,
+                    'learning_rate': 0.1,
+                }],
+                weight_decay=0.01)
+            out.backward()
+            adagrad.step()
+            adagrad.clear_grad()
+
     """

     _moment_acc_str = "moment"
@@ -103,10 +127,17 @@ class Adagrad(Optimizer):
         self.type = "adagrad"
         self._epsilon = epsilon
         self.initial_accumulator_value = initial_accumulator_value
+        self._default_dict = {
+            'epsilon': epsilon,
+            'initial_accumulator_value': initial_accumulator_value,
+        }

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)

+        if isinstance(parameters, dict):
+            parameters = self._update_param_group(parameters)
+
         for p in parameters:
             self._add_accumulator(
                 self._moment_acc_str,
@@ -116,6 +147,9 @@ class Adagrad(Optimizer):
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)

+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)
+
         moment_acc = self._get_accumulator(self._moment_acc_str,
                                            param_and_grad[0])
         # Create the adagrad optimizer op
@@ -133,3 +167,11 @@ class Adagrad(Optimizer):
             stop_gradient=True)

         return adagrad_op
+
+    def _update_param_group(self, parameters):
+        self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
+        self.initial_accumulator_value = parameters.get(
+            'initial_accumulator_value',
+            self._default_dict['initial_accumulator_value'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 63ca462d1a26b8a17e540a1fac2284b77a523a21..baa6a307176dd5feb70f1b6f2201a89f298e6153 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -21,6 +21,7 @@ from ..fluid import unique_name
 from ..fluid.layer_helper import LayerHelper
 import warnings
 from ..fluid.dygraph import base as imperative_base
+from collections import defaultdict

 import paddle

@@ -63,16 +64,19 @@ class Adam(Optimizer):
         epsilon (float|Tensor, optional): A small float value for numerical stability.
             It should be a float number or a Tensor with shape [1] and data type as float32.
             The default value is 1e-08.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, at this time all parameters will be updated.
-        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-            It canbe a float value as coeff of L2 regularization or \
-            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-            the regularization setting here in optimizer will be ignored for this parameter. \
-            Otherwise, the regularization setting here in optimizer will take effect. \
-            Default None, meaning there is no regularization.
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
             ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
@@ -126,6 +130,29 @@ class Adam(Optimizer):
             adam.step()
             adam.clear_grad()

+
+            #Note that the learning_rate of linear_2 is 0.01.
+            linear_1 = paddle.nn.Linear(10, 10)
+            linear_2 = paddle.nn.Linear(10, 10)
+            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            out = linear_1(inp)
+            out = linear_2(out)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.Adam(
+                learning_rate=0.1,
+                parameters=[{
+                    'params': linear_1.parameters()
+                }, {
+                    'params': linear_2.parameters(),
+                    'weight_decay': 0.001,
+                    'learning_rate': 0.1,
+                    'beta1': 0.8
+                }],
+                weight_decay=0.01,
+                beta1=0.9)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
@@ -172,6 +199,12 @@ class Adam(Optimizer):
         self._lazy_mode = lazy_mode
         self._multi_precision = multi_precision
         self._master_weights = {}
+        self._default_dict = {
+            'beta1': beta1,
+            'beta2': beta2,
+            'epsilon': epsilon,
+            'lazy_mode': lazy_mode,
+        }

     def _create_master_weight(self, param):
         assert isinstance(self.helper, LayerHelper)
@@ -241,6 +274,8 @@ class Adam(Optimizer):

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
+        if isinstance(parameters, dict):
+            parameters = self._update_param_group(parameters)

         # Create accumulator tensors for first and second moments
         for p in parameters:
@@ -257,6 +292,8 @@ class Adam(Optimizer):

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)

         moment1 = self._get_accumulator(self._moment1_acc_str,
                                         param_and_grad[0])
@@ -274,6 +311,7 @@ class Adam(Optimizer):

         # create the adam optimize op
         if framework.in_dygraph_mode():
+
             _beta1 = self._beta1 if not isinstance(
                 self._beta1, Variable) else self._beta1.numpy().item(0)
             _beta2 = self._beta2 if not isinstance(
@@ -359,18 +397,43 @@ class Adam(Optimizer):
                 adam.step()
                 adam.clear_grad()
         """
-        params_grads = []
-        for param in self._parameter_list:
-            if param.stop_gradient:
-                continue
-            if param._grad_ivar() is not None:
-                grad_var = param._grad_ivar()
-                if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse(
-                ) and self.regularization is not None:
-                    raise RuntimeError(
-                        "Adam don't support weight_decay with sparse parameters, please set it to None."
-                    )
-                params_grads.append((param, grad_var))
-
-        optimize_ops = self._apply_optimize(
-            loss=None, startup_program=None, params_grads=params_grads)
+        if not isinstance(self._parameter_list[0], dict):
+            params_grads = []
+            for param in self._parameter_list:
+                if param.stop_gradient:
+                    continue
+                if param._grad_ivar() is not None:
+                    grad_var = param._grad_ivar()
+                    if hasattr(grad_var, "_is_sparse") and grad_var._is_sparse(
+                    ) and self.regularization is not None:
+                        raise RuntimeError(
+                            "Adam don't support weight_decay with sparse parameters, please set it to None."
+                        )
+                    params_grads.append((param, grad_var))
+
+            optimize_ops = self._apply_optimize(
+                loss=None, startup_program=None, params_grads=params_grads)
+        else:
+            # optimize parameters in groups
+            for param_group in self._param_groups:
+                params_grads = defaultdict(lambda: list())
+                for param in param_group['params']:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        grad_var = param._grad_ivar()
+                        params_grads['params'].append((param, grad_var))
+                params_grads.update(
+                    {k: v
+                     for k, v in param_group.items() if k != 'params'})
+                self._apply_optimize(
+                    loss=None, startup_program=None, params_grads=params_grads)
+
+    def _update_param_group(self, parameters):
+        self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
+        self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
+        self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
+        self._lazy_mode = parameters.get('lazy_mode',
+                                         self._default_dict['lazy_mode'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py
index 44ae89f49d1c0502a2f18ca9c4d58f10a6a9a69e..867b7703720ba3ffac3004ad886240fb53fc39ee 100644
--- a/python/paddle/optimizer/adamax.py
+++ b/python/paddle/optimizer/adamax.py
@@ -55,16 +55,19 @@ class Adamax(Optimizer):
             The default value is 0.999.
         epsilon (float, optional): A small float value for numerical stability.
             The default value is 1e-08.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, at this time all parameters will be updated.
-        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
-            It canbe a float value as coeff of L2 regularization or \
-            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
-            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
-            the regularization setting here in optimizer will be ignored for this parameter. \
-            Otherwise, the regularization setting here in optimizer will take effect. \
-            Default None, meaning there is no regularization.
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` .
             There are three cliping strategies
             ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
@@ -100,6 +103,29 @@ class Adamax(Optimizer):
             adam.step()
             adam.clear_grad()
+
+            #Note that the learning_rate of linear_2 is 0.01.
+            linear_1 = paddle.nn.Linear(10, 10)
+            linear_2 = paddle.nn.Linear(10, 10)
+            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            out = linear_1(inp)
+            out = linear_2(out)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.Adamax(
+                learning_rate=0.1,
+                parameters=[{
+                    'params': linear_1.parameters()
+                }, {
+                    'params': linear_2.parameters(),
+                    'weight_decay': 0.001,
+                    'learning_rate': 0.1,
+                    'beta1': 0.8
+                }],
+                weight_decay=0.01,
+                beta1=0.9)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
     """
     _moment_acc_str = "moment"
     _inf_norm_acc_str = "inf_norm"
@@ -134,8 +160,16 @@ class Adamax(Optimizer):
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
+        self._default_dict = {
+            'beta1': beta1,
+            'beta2': beta2,
+            'epsilon': epsilon
+        }

     def _create_accumulators(self, block, parameters):
+        if isinstance(parameters, dict):
+            parameters = self._update_param_group(parameters)
+
         # Create accumulator tensors for first moment and infinity norm
         for p in parameters:
             self._add_accumulator(self._moment_acc_str, p)
@@ -148,6 +182,8 @@ class Adamax(Optimizer):

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)

         moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
         inf_norm = self._get_accumulator(self._inf_norm_acc_str,
@@ -183,16 +219,40 @@ class Adamax(Optimizer):
         """Update Beta1 Power accumulator
         """
         assert isinstance(block, framework.Block)
-        for param, grad in parameters_and_grads:
-            if grad is None or param.stop_gradient is True:
-                continue
-            with param.block.program._optimized_guard(
-                [param, grad]), name_scope('adamax'):
-                beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
-                                                      param)
-                block.append_op(
-                    type="scale",
-                    inputs={"X": beta1_pow_acc},
-                    outputs={"Out": beta1_pow_acc},
-                    attrs={"scale": self._beta1},
-                    stop_gradient=True)
+        if isinstance(parameters_and_grads, list):
+            for param, grad in parameters_and_grads:
+                if grad is None or param.stop_gradient is True:
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), name_scope('adamax'):
+                    beta1_pow_acc = self._get_accumulator(
+                        self._beta1_pow_acc_str, param)
+                    block.append_op(
+                        type="scale",
+                        inputs={"X": beta1_pow_acc},
+                        outputs={"Out": beta1_pow_acc},
+                        attrs={"scale": self._beta1},
+                        stop_gradient=True)
+        else:
+            for param, grad in parameters_and_grads['params']:
+                if grad is None or param.stop_gradient is True:
+                    continue
+                with param.block.program._optimized_guard(
+                    [param, grad]), name_scope('adamax'):
+                    beta1_pow_acc = self._get_accumulator(
+                        self._beta1_pow_acc_str, param)
+                    self._beta1 = parameters_and_grads.get(
+                        'beta1', self._default_dict['beta1'])
+                    block.append_op(
+                        type="scale",
+                        inputs={"X": beta1_pow_acc},
+                        outputs={"Out": beta1_pow_acc},
+                        attrs={"scale": self._beta1},
+                        stop_gradient=True)
+
+    def _update_param_group(self, parameters):
+        self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
+        self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
+        self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py
index 304f0b771826c946b7a28f17959aef7d426174c4..c3cffa2998f6cc0956412be7709251720f8a51db 100644
--- a/python/paddle/optimizer/adamw.py
+++ b/python/paddle/optimizer/adamw.py
@@ -45,9 +45,12 @@ class AdamW(Adam):
     Args:
         learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``.
             It can be a float value or a LRScheduler. The default value is 0.001.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, at this time all parameters will be updated.
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
+            The default value is None in static mode, at this time all parameters will be updated.
         beta1 (float|Tensor, optional): The exponential decay rate for the 1st moment estimates.
             It should be a float number or a Tensor with shape [1] and data type as float32.
             The default value is 0.9.
@@ -101,6 +104,30 @@ class AdamW(Adam):
             adam.step()
             adam.clear_grad()

+
+            #Note that the learning_rate of linear_2 is 0.01.
+            linear_1 = paddle.nn.Linear(10, 10)
+            linear_2 = paddle.nn.Linear(10, 10)
+            inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+            out = linear_1(inp)
+            out = linear_2(out)
+            loss = paddle.mean(out)
+            adam = paddle.optimizer.AdamW(
+                learning_rate=0.1,
+                parameters=[{
+                    'params': linear_1.parameters()
+                }, {
+                    'params': linear_2.parameters(),
+                    'weight_decay': 0.001,
+                    'learning_rate': 0.1,
+                    'beta1': 0.8
+                }],
+                weight_decay=0.01,
+                beta1=0.9)
+            out.backward()
+            adam.step()
+            adam.clear_grad()
+
     """

     def __init__(self,
@@ -143,6 +170,7 @@ class AdamW(Adam):
             name=name,
             lazy_mode=lazy_mode,
             multi_precision=multi_precision)
+        self._default_dict = {'coeff': coeff}

     def _append_decoupled_weight_decay(self, block, param_and_grad):
         """
@@ -156,7 +184,10 @@ class AdamW(Adam):
         Raises:
             Exception: The type of coeff and parameter is not consistent.
         """
-        param, grad = param_and_grad
+        if not isinstance(param_and_grad, dict):
+            param, grad = param_and_grad
+        else:
+            param, grad = self._update_param_group(param_and_grad)

         if self._apply_decay_param_fun is not None \
                 and not self._apply_decay_param_fun(param.name):
@@ -207,3 +238,8 @@ class AdamW(Adam):

     def __str__(self):
         return " ".join(["Weight Decay, params:", ",".join(self._params_name)])
+
+    def _update_param_group(self, parameters):
+        self._coeff = parameters.get('coeff', self._default_dict['coeff'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/lamb.py b/python/paddle/optimizer/lamb.py
index bff24e71c815366b6d12108436a82edb27d271a7..b2044ab3ca1715b749f074a4737cfc092aa29666 100644
--- a/python/paddle/optimizer/lamb.py
+++ b/python/paddle/optimizer/lamb.py
@@ -59,7 +59,10 @@ class Lamb(Optimizer):
             Default 0.999.
         epsilon (float, optional): A small float value for numerical stability. Default 1e-6.
         parameters (Iterable, optional): Iterable of ``Variable`` names to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
             The default value is None in static mode, at this time all parameters will be updated.
         grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of
             some derived class of ``GradientClipBase`` . There are three cliping strategies
@@ -83,6 +86,31 @@ class Lamb(Optimizer):
         back = out.backward()
         lamb.step()
         lamb.clear_grad()
+
+
+        #Note that the learning_rate of linear_2 is 0.01.
+        linear_1 = paddle.nn.Linear(10, 10)
+        linear_2 = paddle.nn.Linear(10, 10)
+        inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+        out = linear_1(inp)
+        out = linear_2(out)
+        loss = paddle.mean(out)
+        lamb = paddle.optimizer.Lamb(
+            learning_rate=0.1,
+            parameters=[{
+                'params': linear_1.parameters()
+            }, {
+                'params': linear_2.parameters(),
+                'weight_decay': 0.001,
+                'learning_rate': 0.1,
+                'lamb_weight_decay': 0.02
+            }],
+            weight_decay=0.01,
+            lamb_weight_decay=0.01)
+        out.backward()
+        lamb.step()
+        lamb.clear_grad()
+
     """
     _moment1_acc_str = "moment1"
     _moment2_acc_str = "moment2"
@@ -115,9 +143,18 @@ class Lamb(Optimizer):
         self._epsilon = epsilon
         self._lamb_weight_decay = lamb_weight_decay
         self._exclude_from_weight_decay_fn = exclude_from_weight_decay_fn
+        self._default_dict = {
+            'beta1': beta1,
+            'beta2': beta2,
+            'epsilon': epsilon,
+            'lamb_weight_decay': lamb_weight_decay,
+            'exclude_from_weight_decay_fn': exclude_from_weight_decay_fn,
+        }

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
+        if isinstance(parameters, dict):
+            parameters = self._update_param_group(parameters)

         # Create accumulator tensors for first and second moments
         for p in parameters:
@@ -140,6 +177,9 @@ class Lamb(Optimizer):

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)
+
         block.program._use_lamb = True

         moment1 = self._get_accumulator(self._moment1_acc_str,
@@ -199,3 +239,15 @@ class Lamb(Optimizer):
             stop_gradient=True)

         return lamb_op
+
+    def _update_param_group(self, parameters):
+        self._beta1 = parameters.get('beta1', self._default_dict['beta1'])
+        self._beta2 = parameters.get('beta2', self._default_dict['beta2'])
+        self._epsilon = parameters.get('epsilon', self._default_dict['epsilon'])
+        self._lamb_weight_decay = parameters.get(
+            'lamb_weight_decay', self._default_dict['lamb_weight_decay'])
+        self._exclude_from_weight_decay_fn = parameters.get(
+            'exclude_from_weight_decay_fn',
+            self._default_dict['exclude_from_weight_decay_fn'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py
index 372143553e0c39988f5d0456125ee91bb94d3329..faff090bcb1f4ec2e906d2a3071930176a9c339f 100644
--- a/python/paddle/optimizer/momentum.py
+++ b/python/paddle/optimizer/momentum.py
@@ -51,8 +51,11 @@ class Momentum(Optimizer):
         learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
             It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
         momentum (float): Momentum factor. The default value is 0.9.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
             The default value is None in static mode, at this time all parameters will be updated.
         weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
             It canbe a float value as coeff of L2 regularization or \
@@ -88,6 +91,29 @@ class Momentum(Optimizer):
         back = out.backward()
         momentum.step()
         momentum.clear_grad()
+
+        #Note that the learning_rate of linear_2 is 0.01.
+        linear_1 = paddle.nn.Linear(10, 10)
+        linear_2 = paddle.nn.Linear(10, 10)
+        inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+        out = linear_1(inp)
+        out = linear_2(out)
+        loss = paddle.mean(out)
+        momentum = paddle.optimizer.Momentum(
+            learning_rate=0.1,
+            parameters=[{
+                'params': linear_1.parameters()
+            }, {
+                'params': linear_2.parameters(),
+                'weight_decay': 0.001,
+                'learning_rate': 0.1
+            }],
+            weight_decay=0.01,
+            momentum=0.9)
+        out.backward()
+        momentum.step()
+        momentum.clear_grad()
+
     """
     _velocity_acc_str = "velocity"
@@ -105,7 +131,19 @@ class Momentum(Optimizer):
             raise ValueError("learning_rate is not set")
         if momentum is None:
             raise ValueError("momentum is not set")
+        predicate = lambda regular: isinstance(regular, (L2DecayRegularizer, float))
+        if isinstance(parameters, list):
+            if isinstance(parameters[0], dict):
+                for param_group in parameters:
+                    decay = param_group[
+                        'weight_decay'] if 'weight_decay' in param_group else weight_decay
+                    reg_method, reg_coeff = self._update_regularization(decay)
+                    param_group['regularization_method'] = reg_method
+                    param_group['regularization_coeff'] = reg_coeff
+                    py_regular = None if predicate(decay) else decay
+                    param_group['weight_decay'] = py_regular
+
+        py_regular = None if predicate(weight_decay) else weight_decay
         super(Momentum, self).__init__(
             learning_rate=learning_rate,
@@ -116,22 +154,41 @@ class Momentum(Optimizer):
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
-        self._regularization_method = ""
-        self._regularization_coeff = 0
-        if (isinstance(weight_decay, L2DecayRegularizer)):
-            self._regularization_method = "l2_decay"
-            self._regularization_coeff = weight_decay._regularization_coeff
-        if (isinstance(weight_decay, float)):
-            self._regularization_method = "l2_decay"
-            self._regularization_coeff = weight_decay
+        self._regularization_method, self._regularization_coeff = self._update_regularization(
+            weight_decay)
         self._multi_precision = multi_precision
         self._rescale_grad = rescale_grad
         self._master_weights = {}
+        self._default_dict = {
+            'momentum': momentum,
+            'use_nesterov': use_nesterov,
+            'rescale_grad': rescale_grad,
+            'regularization_method': self._regularization_method,
+            'regularization_coeff': self._regularization_coeff,
+        }
+
         if framework.in_dygraph_mode():
             self.helper = LayerHelper(self.__class__.__name__)
-            for p in parameters:
-                self._add_accumulator(self._velocity_acc_str, p)
+            if isinstance(self._parameter_list[0], dict):
+                for parameters in self._param_groups:
+                    for p in parameters['params']:
+                        self._add_accumulator(self._velocity_acc_str, p)
+            else:
+                for p in parameters:
+                    self._add_accumulator(self._velocity_acc_str, p)
+
+    def _update_regularization(self, weight_decay):
+        reg_method = ""
+        reg_coeff = 0
+
+        if (isinstance(weight_decay, L2DecayRegularizer)):
+            reg_method = "l2_decay"
+            reg_coeff = weight_decay._regularization_coeff
+        if (isinstance(weight_decay, float)):
+            reg_method = "l2_decay"
+            reg_coeff = weight_decay
+        return reg_method, reg_coeff

     def _create_master_weight(self, param):
         assert isinstance(self.helper, LayerHelper)
@@ -197,12 +254,16 @@ class Momentum(Optimizer):

     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+        if isinstance(param_and_grad, dict):
+            param_and_grad = self._update_param_group(param_and_grad)

         velocity_acc = self._get_accumulator(self._velocity_acc_str,
                                              param_and_grad[0])
         lr = self._create_param_lr(param_and_grad)

         if framework.in_dygraph_mode():
+            if isinstance(param_and_grad, dict):
+                self._update_regularization(param_and_grad['weight_decay'])
             _, _ = core.ops.momentum(
                 param_and_grad[0], param_and_grad[1], velocity_acc, lr,
                 param_and_grad[0], velocity_acc, 'mu', self._momentum,
@@ -250,3 +311,18 @@ class Momentum(Optimizer):
             stop_gradient=True)

         return momentum_op
+
+    def _update_param_group(self, parameters):
+        self._momentum = parameters.get('momentum',
+                                        self._default_dict['momentum'])
+        self._use_nesterov = parameters.get('use_nesterov',
+                                            self._default_dict['use_nesterov'])
+        self._rescale_grad = parameters.get('rescale_grad',
+                                            self._default_dict['rescale_grad'])
+        self._regularization_method = parameters.get(
+            'regularization_method',
+            self._default_dict['regularization_method'])
+        self._regularization_coeff = parameters.get(
+            'regularization_coeff', self._default_dict['regularization_coeff'])
+        parameters = parameters.get('params')
+        return parameters
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index b06bd2a2b0be9539ed33f5c898da7d15f92a09a6..0f22b920b17deba923b945115f4f274c84f2ddf6 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -28,7 +28,7 @@ from ..fluid import layers
 from ..fluid import unique_name
 from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name
 from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops
-from ..fluid.framework import program_guard
+from ..fluid.framework import program_guard, Parameter
 from ..fluid.initializer import Constant
 from ..fluid.layer_helper import LayerHelper
 from ..fluid.layers import ops
@@ -41,6 +41,7 @@ from functools import reduce
 from ..fluid.wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
 from .lr import LRScheduler
+import copy

 __all__ = []

@@ -56,7 +57,10 @@ class Optimizer(object):
         learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``.
             It can be a float value or any subclass of ``LRScheduler`` .
         parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
             The default value is None in static mode, at this time all parameters will be updated.
         weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
            It canbe a float value as coeff of L2 regularization or \
@@ -91,6 +95,29 @@ class Optimizer(object):
             adam.step()
             adam.clear_grad()

+        #Take the subclass sgd as an example
+        #optimize parameters in linear_1 and linear_2 with different options.
+        #Note that the learning_rate of linear_2 is 0.01.
+        linear_1 = paddle.nn.Linear(10, 10)
+        linear_2 = paddle.nn.Linear(10, 10)
+        inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1)
+        out = linear_1(inp)
+        out = linear_2(out)
+        loss = paddle.mean(out)
+        sgd = paddle.optimizer.SGD(
+            learning_rate=0.1,
+            parameters=[{
+                'params': linear_1.parameters()
+            }, {
+                'params': linear_2.parameters(),
+                'weight_decay': 0.001,
+                'learning_rate': 0.1
+            }],
+            weight_decay=0.01)
+        out.backward()
+        sgd.step()
+        sgd.clear_grad()
+
     """

     @imperative_base.no_grad
@@ -100,6 +127,7 @@ class Optimizer(object):
                  weight_decay=None,
                  grad_clip=None,
                  name=None):
+
         if parameters is not None:
             # paddle.Tensor is also iterable, so here we don't check whether
             # the input is iterable, if the input is paddle.Tensor, the
@@ -109,6 +137,11 @@ class Optimizer(object):
                     "`parameters` argument given to the optimizer should be "
                     "an iterable of paddle Tensors, but got argument type is `{}`.".
                     format(type(parameters)))
+            if isinstance(parameters, dict):
+                raise TypeError(
+                    "`parameters` argument should not be a dict type, "
+                    "if parameter groups are needed, please set `parameters`"
+                    " as a list of dict")
             self._parameter_list = list(parameters)
         else:
             self._parameter_list = None
@@ -120,14 +153,17 @@ class Optimizer(object):
                     "parameters argument given to the Optimizer should not be None in dygraph mode."
                 )
             if weight_decay is not None:
-                for param in self._parameter_list:
-                    if hasattr(param,
-                               'regularizer') and param.regularizer is not None:
-                        logging.info(
-                            "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
-                            "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
-                            % weight_decay.__str__())
-                        break
+                if not isinstance(self._parameter_list[0], dict):
+                    for param in self._parameter_list:
+                        if hasattr(
+                                param,
+                                'regularizer') and param.regularizer is not None:
+                            logging.info(
+                                "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. "
+                                "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!"
+                                % weight_decay.__str__())
+                            break
+
         if not isinstance(learning_rate, (float, LRScheduler)):
             raise TypeError(
                 "learning rate should be float or LRScheduler, got %s here" %
@@ -148,7 +184,13 @@ class Optimizer(object):
         self._dtype = None
         # Infer the dtype form parameter
         if self._parameter_list:
-            self._dtype = self._parameter_list[0].dtype
+            if isinstance(self._parameter_list[0], dict):
+                for param_group in self._parameter_list:
+                    assert 'params' in param_group, \
+                        'params should be set in parameters if parameter groups are optimized in different options'
+                self._dtype = self._parameter_list[0]['params'][0].dtype
+            else:
+                self._dtype = self._parameter_list[0].dtype

         # each program should have a independent learning rate
         # program -> tensor(learning_rate)
@@ -163,6 +205,18 @@ class Optimizer(object):
         self._accumulators_holder = {}
         self._param_device_map = dict()
         self.clear_gradients = self.clear_grad
+        self._default_dict = {
+            'learning_rate': self._learning_rate,
+            'weight_decay': self.regularization,
+            'grad_clip': self._grad_clip
+        }
+
+        self._param_groups = []
+        if self._parameter_list and isinstance(self._parameter_list[0], dict):
+            for param_group in self._parameter_list:
+                self._add_param_group(param_group.copy())
+        else:
+            self._param_groups = self._parameter_list

     @framework.dygraph_only
     def state_dict(self):
@@ -610,18 +664,45 @@ class Optimizer(object):
         start = len(target_block.ops)

         self.helper = LayerHelper(self.__class__.__name__)
-        self._update_param_device_map(parameters_and_grads, target_block)
-        self._create_accumulators(
-            target_block,
-            [p[0] for p in parameters_and_grads if not p[0].stop_gradient])
+        params_grads_device_map = parameters_and_grads['params'] if isinstance(
+            parameters_and_grads, dict) else parameters_and_grads
+        self._update_param_device_map(params_grads_device_map, target_block)
+
+        if isinstance(parameters_and_grads, list):
+            self._create_accumulators(
+                target_block,
+                [p[0] for p in parameters_and_grads if not p[0].stop_gradient])
+
+        else:
+            params_acc_dict = parameters_and_grads.copy()
+            params_acc_dict['params'] = [
+                p[0] for p in params_acc_dict['params']
+                if not p[0].stop_gradient
+            ]
+            self._create_accumulators(target_block, params_acc_dict)
+
         self._create_global_learning_rate()

         if framework.in_dygraph_mode():
-            for param_and_grad in parameters_and_grads:
-                if param_and_grad[1] is None:
-                    continue
-                if param_and_grad[0].stop_gradient is False:
-                    self._append_optimize_op(target_block, param_and_grad)
+
+            if isinstance(parameters_and_grads, list):
+                for param_and_grad in parameters_and_grads:
+                    if param_and_grad[1] is None:
+                        continue
+                    if param_and_grad[0].stop_gradient is False:
+                        self._append_optimize_op(target_block, param_and_grad)
+            else:
+                for param_and_grad in parameters_and_grads['params']:
+                    if param_and_grad[1] is None:
+                        continue
+                    if param_and_grad[0].stop_gradient is False:
+                        param_grad_dict = dict()
+                        param_grad_dict['params'] = param_and_grad
+                        param_grad_dict.update({
+                            k: v
+                            for k, v in parameters_and_grads.items()
+                            if k != 'params'
+                        })
+                        self._append_optimize_op(target_block, param_grad_dict)
         else:
             for param_and_grad in parameters_and_grads:
                 if param_and_grad[1] is None:
@@ -790,10 +871,19 @@ class Optimizer(object):
         if framework.in_dygraph_mode():
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
-                if self._grad_clip is not None:
-                    params_grads = self._grad_clip(params_grads)
-                params_grads = append_regularization_ops(params_grads,
-                                                         self.regularization)
+                if isinstance(params_grads, list):
+                    if self._grad_clip is not None:
+                        params_grads = self._grad_clip(params_grads)
+                    params_grads = append_regularization_ops(
+                        params_grads, self.regularization)
+                else:
+                    grad_clip = params_grads['grad_clip']
+                    if grad_clip is not None:
+                        params_grads['params'] = grad_clip(params_grads[
+                            'params'])
+
+                    params_grads['params'] = append_regularization_ops(
+                        params_grads['params'], self.regularization)
                 optimize_ops = self._create_optimization_pass(params_grads)
         else:
             program = loss.block.program
@@ -840,9 +930,16 @@ class Optimizer(object):
                 adam.clear_grad()
         """
-        for p in self._parameter_list:
-            if not p.stop_gradient:
-                p.clear_gradient()
+        if self._parameter_list is None or not isinstance(
+                self._parameter_list[0], dict):
+            for p in self._parameter_list:
+                if not p.stop_gradient:
+                    p.clear_gradient()
+        else:
+            for param_group in self._param_groups:
+                for p in param_group['params']:
+                    if not p.stop_gradient:
+                        p.clear_gradient()

     @imperative_base.no_grad
     def minimize(self,
@@ -934,13 +1031,82 @@ class Optimizer(object):
             adam.step()
             adam.clear_grad()
         """
-        params_grads = []
-        for param in self._parameter_list:
-            if param.stop_gradient:
-                continue
-            if param._grad_ivar() is not None:
-                grad_var = param._grad_ivar()
-                params_grads.append((param, grad_var))
-
-        self._apply_optimize(
-            loss=None, startup_program=None, params_grads=params_grads)
+
+        if not isinstance(self._param_groups[0], dict):
+            params_grads = []
+            for param in self._param_groups:
+                if param.stop_gradient:
+                    continue
+                if param._grad_ivar() is not None:
+                    grad_var = param._grad_ivar()
+                    params_grads.append((param, grad_var))
+
+            self._apply_optimize(
+                loss=None, startup_program=None, params_grads=params_grads)
+
+        else:
+            # optimize parameters in groups
+            for param_group in self._param_groups:
+                params_grads = defaultdict(lambda: list())
+                for param in param_group['params']:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        grad_var = param._grad_ivar()
+                        params_grads['params'].append((param, grad_var))
+                params_grads.update(
+                    {k: v
+                     for k, v in param_group.items() if k != 'params'})
+                self._apply_optimize(
+                    loss=None, startup_program=None, params_grads=params_grads)
+
+    def _add_param_group(self, param_group):
+        """
+        Add a param group to parameter_list.
+
+        Args:
+            param_group (dict): The group of Tensors to be optimized with
+                different optimization options.
+ """ + params = param_group['params'] + if isinstance(params, Parameter): + param_group['params'] = [params] + elif isinstance(params, set): + raise TypeError( + "optimizer parameters should be in ordered collections," + "but received set, please use list instead.") + else: + param_group['params'] = list(params) + + # Update optimization options for each groups + for k, v in self._default_dict.items(): + param_group.setdefault(k, v) + + param_set = set() + for group in self._param_groups: + param_set.update(set(group['params'])) + + if not param_set.isdisjoint(set(param_group['params'])): + raise ValueError( + "some parameters appear in more than one parameter group") + + for param in param_group['params']: + weight_decay = param_group['weight_decay'] + if isinstance(weight_decay, float): + from ..fluid.regularizer import L2Decay + regularization = L2Decay(weight_decay) + else: + regularization = weight_decay + param.regularizer = regularization + param.optimize_attr['learning_rate'] = param_group['learning_rate'] + + self._param_groups.append(param_group) + + def _update_param_group(self, parameters): + """ + Update the param group with new entry + Args: + parameters (dict): The extra group of Tensors to be optimzed with + different optimization options. Only used in child class. + """ + pass diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index b0bb0228c8ca82acc40b62e1a9074636b4def097..14249df3f5628fff3823e770d843f5af0a7e8c1e 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -80,16 +80,19 @@ class RMSProp(Optimizer): the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. - parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ - This parameter is required in dygraph mode. \ - The default value is None in static mode, at this time all parameters will be updated. - weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ - It canbe a float value as coeff of L2 regularization or \ - :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. - If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ - the regularization setting here in optimizer will be ignored for this parameter. \ - Otherwise, the regularization setting here in optimizer will take effect. \ - Default None, meaning there is no regularization. + parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \ + This parameter is required in dygraph mode. And you can specify different options for \ + different parameter groups such as the learning rate, weight decay, etc, \ + then the parameters are list of dict. Note that the learning_rate in paramter groups \ + represents the scale of base learning_rate. \ + The default value is None in static mode, at this time all parameters will be updated. + weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \ + It canbe a float value as coeff of L2 regularization or \ + :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`. + If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \ + the regularization setting here in optimizer will be ignored for this parameter. \ + Otherwise, the regularization setting here in optimizer will take effect. 
\ + Default None, meaning there is no regularization. grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of some derived class of ``GradientClipBase`` . There are three cliping strategies ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` , @@ -117,6 +120,26 @@ class RMSProp(Optimizer): rmsprop.step() rmsprop.clear_grad() + #Note that the learning_rate of linear_2 is 0.01. + linear_1 = paddle.nn.Linear(10, 10) + linear_2 = paddle.nn.Linear(10, 10) + inp = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) + out = linear_1(inp) + out = linear_2(out) + loss = paddle.mean(out) + rmsprop = paddle.optimizer.RMSProp( + learning_rate=0.1, + parameters=[{ + 'params': linear_1.parameters() + }, { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1 + }], + weight_decay=0.01) + out.backward() + rmsprop.step() + rmsprop.clear_grad() """ _momentum_acc_str = "momentum" @@ -160,11 +183,20 @@ class RMSProp(Optimizer): self._epsilon = epsilon self._momentum = momentum self._centered = centered + self._default_dict = { + 'rho': rho, + 'epsilon': epsilon, + 'momentum': momentum, + 'centered': centered, + } def _create_accumulators(self, block, parameters): if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") + if isinstance(parameters, dict): + parameters = parameters.get('params') + for p in parameters: self._add_accumulator(self._momentum_acc_str, p) self._add_accumulator(self._mean_square_acc_str, p) @@ -174,6 +206,9 @@ class RMSProp(Optimizer): if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") + if isinstance(param_and_grad, dict): + param_and_grad = self._update_param_group(param_and_grad) + momentum_acc = self._get_accumulator(self._momentum_acc_str, param_and_grad[0]) mean_square_acc = self._get_accumulator(self._mean_square_acc_str, @@ -205,3 +240,13 @@ class RMSProp(Optimizer): stop_gradient=True) return rmsprop_op + + def _update_param_group(self, parameters): + self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) + self._rho = parameters.get('rho', self._default_dict['rho']) + self._momentum = parameters.get('momentum', + self._default_dict['momentum']) + self._centered = parameters.get('centered', + self._default_dict['centered']) + parameters = parameters.get('params') + return parameters diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py index 4526034b405b0c97f1b06e07f3e4279cdc2d0d95..107581e060588af8b51744f87eba1278c6f1c1eb 100644 --- a/python/paddle/optimizer/sgd.py +++ b/python/paddle/optimizer/sgd.py @@ -87,6 +87,8 @@ class SGD(Optimizer): @no_grad def _append_optimize_op(self, block, param_and_grad): + if isinstance(param_and_grad, dict): + param_and_grad = self._update_param_group(param_and_grad) lr = self._create_param_lr(param_and_grad) if framework.in_dygraph_mode(): core.ops.sgd(param_and_grad[0], lr, param_and_grad[1], @@ -106,3 +108,7 @@ class SGD(Optimizer): stop_gradient=True) return sgd_op + + def _update_param_group(self, parameters): + parameters = parameters.get('params') + return parameters diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index e1012e7656a3d3c00913403b886f1848ac6c3ce2..fb0244a41499a0c794ad9eb3005794beb3b951b1 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -40,9 +40,8 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): 
Constructs a ``paddle.Tensor`` from ``data`` , which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor. - If the ``data`` is already a tensor, and ``dtype`` or ``place`` does't change, no copy - will be performed and return origin tensor, otherwise a new tensor will be constructed - and returned. + If ``data`` is already a Tensor, a copy will be performed and a new tensor will be returned. + If you only want to change its stop_gradient property, please set ``Tensor.stop_gradient = stop_gradient`` directly. Args: data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. @@ -75,32 +74,31 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): # paddle.to_tensor(1) - # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, # [1]) - x = paddle.to_tensor(1) - paddle.to_tensor(x, dtype='int32', place=paddle.CPUPlace()) # A new tensor will be constructed due to different dtype or place - # Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True, + x = paddle.to_tensor(1, stop_gradient=False) + print(x) + # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=False, # [1]) - paddle.to_tensor((1.1, 2.2), place=paddle.CUDAPinnedPlace()) - # Tensor(shape=[1], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True, - # [1]) + paddle.to_tensor(x) # A new tensor will be created with default stop_gradient=True + # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, + # [1]) - paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CUDAPlace(0), stop_gradient=False) - # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CPUPlace(), stop_gradient=False) + # Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=False, + # [[0.10000000, 0.20000000], # [0.30000001, 0.40000001]]) type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64')) - # + # <class 'paddle.Tensor'> paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64') - # Tensor(shape=[2, 2], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # Tensor(shape=[2, 2], dtype=complex64, place=CPUPlace, stop_gradient=True, # [[(1+1j), (2+0j)], # [(3+2j), (4+0j)]]) """ - place = _get_paddle_place(place) if place is None: place = _current_expected_place() @@ -119,10 +117,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): if not isinstance(data, np.ndarray): - def _handle_diff_place_dtype(data, dtype, place, stop_gradient): - data.stop_gradient = stop_gradient - if not data.place._equals(place): - data = data._copy_to(place, False) + def _handle_dtype(data, dtype): if dtype: if convert_dtype(dtype) != convert_dtype(data.dtype): return data.astype(convert_dtype(dtype)) @@ -138,11 +133,17 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "this means the input data contains nested lists with different lengths.
" ) elif isinstance(data, paddle.Tensor): - return _handle_diff_place_dtype(data, dtype, place, stop_gradient) - elif isinstance(data, (core.Tensor, core.LoDTensor)): - # convert LoDTensor to VarBase first, and then process it as input VarBase + data = data._copy_to(place, False) + ata = _handle_dtype(data, dtype) + data.stop_gradient = stop_gradient + elif isinstance(data, core.LoDTensor): + # convert LoDTensor to VarBase first + # Currenly, LoDTensor does no copy when places are same data = paddle.Tensor(data) - return _handle_diff_place_dtype(data, dtype, place, stop_gradient) + if not data.place._equals(place): + data = data._copy_to(place, False) + data = _handle_dtype(data, dtype) + data.stop_gradient = stop_gradient else: raise TypeError( "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|numpy.ndarray|paddle.Tensor". diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 23addcb7e3f4e354a0331f79e8dd7986adcb8832..2f69946c52139be5c65bf5b2c38d0d17c9b58103 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -232,13 +232,11 @@ def add(x, y, name=None): print(z) # [3., 8., 6. ] """ - op_type = 'elementwise_add' - axis = -1 + if in_dygraph_mode(): - return _elementwise_op_in_dygraph( - x, y, axis=axis, op_name=op_type) + return core.ops.elementwise_add(x, y) - return _elementwise_op(LayerHelper(op_type, **locals())) + return _elementwise_op(LayerHelper('elementwise_add', **locals())) @inplace_apis_in_dygraph_only diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 6045ac7d1e7274dc283206c5533b46170c05a621..dcaa1ca15e5dcbcdd221e89bcb64a9e280995f1e 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -42,10 +42,10 @@ if IS_WINDOWS and six.PY3: from unittest.mock import Mock _du_build_ext.get_export_symbols = Mock(return_value=None) +CUDA_HOME = find_cuda_home() if core.is_compiled_with_rocm(): ROCM_HOME = find_rocm_home() -else: - CUDA_HOME = find_cuda_home() + CUDA_HOME = ROCM_HOME def setup(**attr): @@ -427,8 +427,14 @@ class BuildExtension(build_ext, object): elif isinstance(cflags, dict): cflags = cflags['cxx'] + # NOTE(Aurelius84): Since Paddle 2.0, we require gcc version > 5.x, + # so we add this flag to ensure the symbol names from user compiled + # shared library have same ABI suffix with core_(no)avx.so. 
+ # See https://stackoverflow.com/questions/34571583/understanding-gcc-5s-glibcxx-use-cxx11-abi-or-the-new-abi + add_compile_flag(['-D_GLIBCXX_USE_CXX11_ABI=1'], cflags) + add_std_without_repeat( - cflags, self.compiler.compiler_type, use_std14=False) + cflags, self.compiler.compiler_type, use_std14=True) original_compile(obj, src, ext, cc_args, cflags, pp_opts) finally: # restore original_compiler diff --git a/python/setup.py.in b/python/setup.py.in index 79c67182f9c7911aaae32dfaf660c49dbe683e1c..98d05c367f162330fd4f19d046c311bf01480399 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -404,15 +404,15 @@ headers = ( list(find_files('*', '${BOOST_INCLUDE_DIR}/boost', True)) + # boost # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` # to `extension/incude`, - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex64.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex128.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON': - headers += list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) # errorMessage.pb for errormessage + # externalErrorMsg.pb for External Error message + headers += list(find_files('*.pb', '${externalError_INCLUDE_DIR}')) class InstallCommand(InstallCommandBase): def finalize_options(self): diff --git a/tools/CrossStackProfiler/CspChromeTraceFormatter.py b/tools/CrossStackProfiler/CspChromeTraceFormatter.py new file mode 100755 index 0000000000000000000000000000000000000000..a8030988aacf1a922c41257a409c27274a5aba0a --- /dev/null +++ b/tools/CrossStackProfiler/CspChromeTraceFormatter.py @@ -0,0 +1,129 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import json +import six +import sys +import re +import os +import glob +import unittest +import pandas +import tempfile +import platform +import pandas as pd + + +class ChromeTraceFormatter(object): + def __init__(self): + self._events = [] + self._metadata = [] + + def _create_event(self, ph, category, name, pid, tid, timestamp): + """Creates a new Chrome Trace event. + + For details of the file format, see: + https://github.com/catapult-project/catapult/blob/master/tracing/README.md + + Args: + ph: The type of event - usually a single character. + category: The event category as a string. + name: The event name as a string. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + timestamp: The timestamp of this event as a long integer. + + Returns: + A JSON compatible event object. 
+ """ + event = {} + event['ph'] = ph + event['cat'] = category + event['name'] = name + event['pid'] = pid + event['tid'] = tid + event['ts'] = timestamp + return event + + def emit_pid(self, name, pid): + """Adds a process metadata event to the trace. + + Args: + name: The process name as a string. + pid: Identifier of the process as an integer. + """ + event = {} + event['name'] = 'process_name' + event['ph'] = 'M' + event['pid'] = pid + event['args'] = {'name': name} + self._metadata.append(event) + + def emit_region(self, timestamp, duration, pid, tid, category, name, args): + """Adds a region event to the trace. + + Args: + timestamp: The start timestamp of this region as a long integer. + duration: The duration of this region as a long integer. + pid: Identifier of the process generating this event as an integer. + tid: Identifier of the thread generating this event as an integer. + category: The event category as a string. + name: The event name as a string. + args: A JSON-compatible dictionary of event arguments. + """ + event = self._create_event('X', category, name, pid, tid, timestamp) + event['dur'] = duration + event['args'] = args + self._events.append(event) + + def emit_counter(self, category, name, pid, timestamp, counter, value): + """Emits a record for a single counter. + + Args: + category: The event category as string + name: The event name as string + pid: Identifier of the process generating this event as integer + timestamp: The timestamps of this event as long integer + counter: Name of the counter as string + value: Value of the counter as integer + tid: Thread id of the allocation as integer + """ + event = self._create_event('C', category, name, pid, 0, timestamp) + event['args'] = {counter: value} + self._events.append(event) + + def format_to_string(self, pretty=False): + """Formats the chrome trace to a string. + + Args: + pretty: (Optional.) If True, produce human-readable JSON output. + + Returns: + A JSON-formatted string in Chrome Trace format. + """ + trace = {} + trace['traceEvents'] = self._metadata + self._events + if pretty: + return json.dumps(trace, indent=4, separators=(',', ': ')) + else: + return json.dumps(trace, separators=(',', ':')) + + def clear(self): + self._events = [] + self._metadata = [] + + +if __name__ == "__main__": + pass diff --git a/tools/CrossStackProfiler/CspFileReader.py b/tools/CrossStackProfiler/CspFileReader.py new file mode 100755 index 0000000000000000000000000000000000000000..12de488aa693ebbdd0443bec7a2c7a25f35adffa --- /dev/null +++ b/tools/CrossStackProfiler/CspFileReader.py @@ -0,0 +1,400 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time +import json +import glob +import logging +import pandas as pd +from multiprocessing import Process, Lock +""" Some terms to clarify the code + in most case, one or more paremeters may be set as input args for a class or a function + in form of single variable or k-v dict + + 1. trainerId + 2. 
gpuId + 3. rankId + 4. gpuPerTrainer + 5. groupSize + 6. groupId + 7. groupNum + 8. displaySize + 9. dataPath + 10. resultPath + 11. fileOrganizeForm -- "byRank" OR "byTrainer" or "other" + +""" + +PIPELINEINFO_TRACE_NUM = 1 + +dcgmMetricParameterMap = { + "02_gpuUtility": [("GPUTL", "GPUTL"), ("GRACT", "GRACT")], + "03_smUtility": [("SMACT", "SMACT"), ("SMOCC", "SMOCC")], + "04_memUtility": [("FB_USED_RATIO", "FB_USED_RATIO"), ("DRAMA", "DRAMA")], + "05_txUtility": [("NVLTX", "NVLTX"), ("NVLRX", "NVLRX"), ("PCITX", "PCITX"), + ("PCIRX", "PCIRX")], + "06_calUtility": + [("FP32A", "FP32A"), ("FP16A", "FP16A"), ("TENSO", "TENSO")] +} +DCGMINFO_TRACE_NUM = len(dcgmMetricParameterMap.keys()) +NETINFO_TRACE_NUM = 2 + +DCGM_PATH = "dcgm" +NET_PATH = "net" +TIME_PATH = "time" +PROFILE_PATH = "profile" + +FILEORGANIZEFORM_BYRANK = "byRank" +FILEORGANIZEFORM_BYTRAINER = "byTrainer" +FILEORGANIZEFORM_BYOTHER = "other" +FILEORGANIZEFORM = [ + FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, + FILEORGANIZEFORM_BYOTHER +] + + +class FileReader(object): + def __init__(self, logger, args): + self._logger = logger + self._args = args + + self._fileList = [] + self._fileNum = 0 + + self._dataPath = "" + self._groupSize = 0 + self._displaySize = 0 + self._organizeForm = FILEORGANIZEFORM_BYOTHER + self._gpuPerTrainer = 0 + + self._checkArgs() + self._getFileList() + + self._lock = Lock() + + def printArgs(self): + self._logger.info("dataPath:") + self._logger.info(self._dataPath) + self._logger.info("groupSize:") + self._logger.info(self._groupSize) + self._logger.info("displaySize:") + self._logger.info(self._displaySize) + self._logger.info("organizeForm:") + self._logger.info(self._organizeForm) + self._logger.info("gpuPerTrainer:") + self._logger.info(self._gpuPerTrainer) + self._logger.info("minTimeStamp:") + self._logger.info(self._minTimeStamp) + + def _checkArgsKey(self, key, type): + if key not in self._args: + raise KeyError("args should have key [%s]!" % key) + + if not isinstance(self._args[key], type): + raise TypeError( + "Invalid type of key [%s] in args dict, it should be a %s!" % + (key, type)) + + setattr(self, "_%s" % key, self._args[key]) + + def _align_ts(self, ts): + return ts - self._minTimeStamp + + def _checkArgs(self): + if not isinstance(self._args, dict): + raise TypeError("Invalid type of args, it should be a dict!") + + self._checkArgsKey("organizeForm", str) + if self._organizeForm not in FILEORGANIZEFORM or \ + self._organizeForm == FILEORGANIZEFORM_BYOTHER: + raise NotImplementedError( + "we do not know how to process this form of file [%s]!" % + self._organizeForm) + + self._checkArgsKey("gpuPerTrainer", int) + + self._checkArgsKey("dataPath", str) + if not os.path.exists(self._dataPath): + raise IOError("input data path [%s] does not exist!"
% + (self._dataPath)) + + self._checkArgsKey("groupSize", int) + self._checkArgsKey("displaySize", int) + self._checkArgsKey("minTimeStamp", int) + + def getFileListByGroup(self, groupId): + lIndext = 0 + rIndext = 0 + + if self._organizeForm == FILEORGANIZEFORM_BYTRAINER: + lIndext = groupId * self._groupSize + rIndext = (groupId + 1) * self._groupSize + elif self._organizeForm == FILEORGANIZEFORM_BYRANK: + lIndext = groupId * self._groupSize * self._gpuPerTrainer + rIndext = (groupId + 1) * self._groupSize * self._gpuPerTrainer + + try: + return self._fileList[lIndext:rIndext] + except IndexError: + raise IndexError("invalid index of file list") + + def getFileList(self): + return self._fileList + + def _cmp(self, x, y): + return self._getId(x, self._organizeForm) - self._getId( + y, self._organizeForm) + + def _getFileList(self): + self._fileList = glob.glob(os.path.join(self._dataPath, "*.*")) + + # check unique + idList = [] + newFileList = [] + for file in self._fileList: + id = self._getId(file, self._organizeForm) + if id not in idList: + idList.append(id) + newFileList.append(file) + else: + raise NotImplementedError( + "[%s] has a duplicated id, we do not know how to process it!" % + file) + + # sort + def _sortBySuffix(elem): + return int(elem.split(".")[-1]) + + self._fileList.sort(key=_sortBySuffix) + + if self._fileList: + if (self._getId(self._fileList[-1], self._organizeForm) - + self._getId(self._fileList[0], self._organizeForm) + ) != len(self._fileList) - 1: + raise Exception("The file ids should be consecutive!") + + if not self._fileList: + self._logger.warning("we can not find any file in dir [%s]!" % + self._dataPath) + else: + self._logger.info("file list in dir [%s] is : %s !" % + (self._dataPath, ', '.join(self._fileList))) + + return self._fileList + + def _getId(self, fileName, organizeForm, sed="."): + if self._organizeForm != organizeForm: + raise TypeError("Can not get rank id when organizer form is not %s!" + % organizeForm) + + if not os.path.isfile(fileName): + raise IOError("[%s] is not a valid file!" % (fileName)) + + try: + suffix_str = fileName.split(sed)[-1] + try: + return int(suffix_str) + except ValueError as Argument: + print(Argument) + raise TypeError("invalid fileName [%s]" % fileName) + + except IndexError as Argument: + print(Argument) + raise TypeError( + "invalid fileName [%s], the suffix should be a number!"
% + fileName) + + def getRankId(self, fileName, sed="."): + return self._getId(fileName, FILEORGANIZEFORM_BYRANK, sed) + + def getRankNum(self): + if self._organizeForm == FILEORGANIZEFORM_BYRANK: + return len(self._fileList) + + elif self._organizeForm == FILEORGANIZEFORM_BYTRAINER: + return len(self._fileList) * self._gpuPerTrainer + + def getTrainerNum(self): + if self._organizeForm == FILEORGANIZEFORM_BYRANK: + return len(self._fileList) // self._gpuPerTrainer + + elif self._organizeForm == FILEORGANIZEFORM_BYTRAINER: + return len(self._fileList) + + def getTrainerId(self, fileName, sed="."): + return self._getId(fileName, FILEORGANIZEFORM_BYTRAINER, sed) + + def _splitTaskListForMultiProcess(self, ls, n): + if not isinstance(ls, list) or not isinstance(n, int): + return [] + ls_len = len(ls) + if n <= 0 or 0 == ls_len: + return [] + if n >= ls_len: + return [[i] for i in ls] + else: + j = (ls_len + n - 1) // n + k = ls_len % n + ls_return = [] + end = 0 + for i in range(0, (n) * j, j): + if i < len(ls) and (i + j) < len(ls): + ls_return.append(ls[i:i + j]) + end = i + j + ls_return.append(ls[end:]) + return ls_return + + def getOpInfoFileName(self, groupId, gpuId, tmpPath="./tmp"): + return self.getFileName("opinfo", groupId, gpuId, tmpPath) + + def getPipeLineInfoFileName(self, groupId, gpuId, tmpPath="./tmp"): + return self.getFileName("pipelineinfo", groupId, gpuId, tmpPath) + + def getDCGMInfoFileName(self, groupId, gpuId, tmpPath="./tmp"): + return self.getFileName("dcgm", groupId, gpuId, tmpPath) + + def getFileName(self, name, groupId, gpuId, tmpPath="./tmp"): + return os.path.join(tmpPath, "%s_%d_%d.json" % (name, groupId, gpuId)) + + def getOpInfoDict(self, groupId, gpuId, tmpPath="./tmp"): + return self.getDict("opinfo", groupId, gpuId, tmpPath) + + def getDcgmInfoDict(self, groupId, gpuId, tmpPath="./tmp"): + return self.getDict("dcgm", groupId, gpuId, tmpPath) + + def getDict(self, name, groupId, gpuId, tmpPath="./tmp"): + fileName = self.getFileName(name, groupId, gpuId, tmpPath) + if not os.path.isfile(fileName): + raise IOError("[%s] does not exist!" % fileName) + + data = {} + with open(fileName, "r") as rf: + try: + data = json.load(rf) + except Exception: + self._logger.error("read [%s] error. not a json file!" % + (fileName)) + raise TypeError("read [%s] error. not a json file!" % + (fileName)) + return data + + def dumpOpInfoDict(self, + data, + groupId, + gpuId, + pretty=False, + tmpPath="./tmp"): + return self.dumpDict( + data, "opinfo", groupId, gpuId, pretty=pretty, tmpPath=tmpPath) + + def dumpDCGMDict(self, data, groupId, gpuId, pretty=False, tmpPath="./tmp"): + return self.dumpDict( + data, "dcgm", groupId, gpuId, pretty=pretty, tmpPath=tmpPath) + + def dumpDict(self, + data, + name, + groupId, + gpuId, + pretty=False, + tmpPath="./tmp"): + self._lock.acquire() + if not os.path.exists(tmpPath): + os.makedirs(tmpPath) + self._lock.release() + if pretty: + jsObj = json.dumps(data, indent=4, separators=(',', ': ')) + else: + jsObj = json.dumps(data, separators=(',', ':')) + + fileName = self.getFileName(name, groupId, gpuId, tmpPath) + if os.path.isfile(fileName): + os.remove(fileName) + + fileObject = open(fileName, 'w') + fileObject.write(jsObj) + fileObject.close() + self._logger.info("dump [%s] successfully!"
% fileName) + + +def getLogger(): + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + + rq = time.strftime('%Y%m%d%H%M.%S', time.localtime(time.time())) + log_path = os.path.dirname(os.getcwd()) + '/Logs/' + if not os.path.exists(log_path): + os.makedirs(log_path) + + log_name = log_path + rq + '.log' + logfile = log_name + fh = logging.FileHandler(logfile, mode='w') + fh.setLevel(logging.DEBUG) + + formatter = logging.Formatter( + "%(asctime)s - %(filename)s[line:%(lineno)d] - %(process)d - %(levelname)s: %(message)s" + ) + fh.setFormatter(formatter) + + logger.addHandler(fh) + return logger + + +def test_FileReader(args): + try: + testReader = FileReader(None, args) + except Exception as Argument: + print(Argument) + else: + testReader.printArgs() + + +if __name__ == "__main__": + args = 0 + test_FileReader(args) + + args = { + "dataPath": ".", + "groupSize": 1, + "displaySize": 1, + "gpuPerTrainer": 8, + "organizeForm": FILEORGANIZEFORM_BYOTHER, + } + test_FileReader(args) + + args = { + "dataPath": ".", + "groupSize": 1, + "displaySize": 1, + "gpuPerTrainer": 8, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + test_FileReader(args) + + args = { + "dataPath": "./res", + "groupSize": 1, + "displaySize": 1, + "gpuPerTrainer": 8, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + test_FileReader(args) + + args = { + "dataPath": ".", + "groupSize": "", + "displaySize": 1, + "gpuPerTrainer": 8, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + test_FileReader(args) diff --git a/tools/CrossStackProfiler/CspReporter.py b/tools/CrossStackProfiler/CspReporter.py new file mode 100755 index 0000000000000000000000000000000000000000..1b8ae0e3855348441e99e61fd302742852ac0156 --- /dev/null +++ b/tools/CrossStackProfiler/CspReporter.py @@ -0,0 +1,237 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
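For reference, the list-chunking strategy used by FileReader._splitTaskListForMultiProcess above can be checked in isolation; the following standalone sketch (a simplified re-implementation with hypothetical file names, not part of the patch) reproduces its behavior: tasks are split into at most n chunks of ceil(len/n) items each, which are then handed to worker processes.

# sketch: split a task list into at most n chunks for worker processes
def split_tasks(ls, n):
    if n <= 0 or not ls:
        return []
    if n >= len(ls):
        return [[i] for i in ls]
    j = (len(ls) + n - 1) // n  # chunk size, rounded up
    chunks, end = [], 0
    for i in range(0, n * j, j):
        if i + j < len(ls):
            chunks.append(ls[i:i + j])
            end = i + j
    chunks.append(ls[end:])  # remainder goes into the final chunk
    return chunks

print(split_tasks(["profile.%d" % i for i in range(10)], 4))
# -> three chunks of 3 files plus one chunk holding the remainder

Note that the final chunk absorbs whatever is left over, so the worker count never exceeds n even when len(ls) is not divisible by n.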
+ +import os +import glob +import logging +import argparse +import multiprocessing + +import pandas as pd +from multiprocessing import Process + +from NetFileReader import netFileReader +from DCGMFileReader import dcgmFileReader +from ProfileFileReader import profileFileReader + +from CspFileReader import getLogger +from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH +from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM +from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM + + +def get_argparse(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + '--profile_path', + type=str, + default='.', + help='Working path that stores the monitor data.') + + parser.add_argument( + '--timeline_path', + type=str, + default='.', + help='Output timeline file name.') + + parser.add_argument( + '--gpuPerTrainer', type=int, default=8, help='Gpus per trainer.') + + parser.add_argument( + '--trainerNum', type=int, default=4, help='Num of trainers.') + + parser.add_argument( + '--groupSize', type=int, default=8, help='Num of trainers in a group.') + + parser.add_argument( + '--displaySize', + type=int, + default=2, + help='Num of lines to display in a group.') + + return parser.parse_args() + + +class CspReporter(object): + def __init__(self, args): + self._args = args + print(self._args) + + self._workPath = self._args.profile_path + self._saveFilePath = self._args.timeline_path + self._gpuPerTrainer = self._args.gpuPerTrainer + self._groupSize = self._args.groupSize + self._displaySize = self._args.displaySize + self._trainerNum = self._args.trainerNum + + self._checkArgs() + + self._init_logger() + self._init_timeInfo() + self._init_reader() + + def _checkArgs(self): + if self._trainerNum % self._groupSize != 0: + raise Exception( + "Input args error: trainerNum[%d] %% groupSize[%d] != 0" % + (self._trainerNum, self._groupSize)) + + def _init_logger(self): + self._logger = getLogger() + + def _init_reader(self): + self._dcgmPath = os.path.join(self._workPath, DCGM_PATH) + self._netPath = os.path.join(self._workPath, NET_PATH) + self._profilePath = os.path.join(self._workPath, PROFILE_PATH) + + self._netFileReaderArgs = { + "dataPath": self._netPath, + "groupSize": self._groupSize, + "displaySize": self._displaySize, + "gpuPerTrainer": self._gpuPerTrainer, + "minTimeStamp": self._minTimeStamp, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + + self._dcgmFileReaderArgs = { + "dataPath": self._dcgmPath, + "groupSize": self._groupSize, + "displaySize": self._displaySize, + "gpuPerTrainer": self._gpuPerTrainer, + "minTimeStamp": self._minTimeStamp, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + + self._profileFileReaderArgs = { + "dataPath": self._profilePath, + "groupSize": self._groupSize, + "displaySize": self._displaySize, + "gpuPerTrainer": self._gpuPerTrainer, + "minTimeStamp": self._minTimeStamp, + "organizeForm": FILEORGANIZEFORM_BYRANK, + } + + self._dcgmFileReader = dcgmFileReader(self._logger, + self._dcgmFileReaderArgs) + self._profileFileReader = profileFileReader(self._logger, + self._profileFileReaderArgs) + + def _init_timeInfo(self): + self._timePath = os.path.join(self._workPath, TIME_PATH) + self._timeInfo = {} + self._minTimeStamp = 0 + self._set_timeInfo() + + def _set_timeInfo(self, timeFileNamePrefix="time.txt", sed="."): + timeFileNameList = glob.glob( + os.path.join(self._timePath, timeFileNamePrefix + sed + "*")) + for
timeFileName in timeFileNameList: + trainerId = int(timeFileName.split(sed)[-1]) + gpuId = int(timeFileName.split(sed)[-2]) + info = {} + with open(timeFileName, "r") as rf: + for line in rf: + if line.startswith("start time:"): + info["start_time"] = int( + float(line.split(":")[-1]) * 1e9) + + self._minTimeStamp = min(self._minTimeStamp, + info["start_time"]) + + if line.startswith("end time:"): + info["end_time"] = int(float(line.split(":")[-1]) * 1e9) + if info: + self._timeInfo[gpuId * trainerId] = info + + def _generateTraceFileByGroupAndGpuId(self, pipelineInfo, netInfo, groupId, + gpuId): + dcgmInfoDict = self._dcgmFileReader.getDcgmInfoDict(groupId, gpuId) + opInfoDict = self._profileFileReader.getOpInfoDict(groupId, gpuId) + + traceObj = {} + traceObj["traceEvents"] = pipelineInfo[str(gpuId)] + opInfoDict[ + "traceEvents"] + dcgmInfoDict["traceEvents"] + netInfo[ + "traceEvents"] + + self._profileFileReader.dumpDict(traceObj, "traceFile", groupId, gpuId, + False, self._saveFilePath) + + def _generateTraceFileByGroup(self, groupId, processNum): + # first we need to generate pipeline info + pipelineInfo = self._profileFileReader.getPipeLineInfo(groupId, + processNum) + # second we need to generate dcgm info + dcgmInfo = self._dcgmFileReader.getDCGMTraceInfo(groupId, processNum) + + # third we need to generate net info + netInfo = {} + netInfo["traceEvents"] = [] + # netInfo = self._netFileReader.parseFileByGroup(groupId, processNum) + + # fourth we need to generate op info + opInfo = self._profileFileReader.getOPTraceInfo(groupId) + + # finally we need to dump this information to disk + processPool = [] + pidList = [] + + for gpuId in range(self._gpuPerTrainer): + subproc = Process( + target=self._generateTraceFileByGroupAndGpuId, + args=( + pipelineInfo, + netInfo, + groupId, + gpuId, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[traceFile]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, 1)) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[traceFile]: process [%d] has exited! remained %d process!" % + (t.pid, len(pidList))) + + def generateTraceFile(self, processNum=8): + processPool = [] + pidList = [] + for groupId in range(self._trainerNum // self._groupSize): + subproc = Process( + target=self._generateTraceFileByGroup, + args=( + groupId, + processNum, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[GroupTraceFile]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, 1)) + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[GroupTraceFile]: process [%d] has exited! remained %d process!" + % (t.pid, len(pidList))) + + +if __name__ == '__main__': + args = get_argparse() + tl = CspReporter(args) + tl.generateTraceFile() diff --git a/tools/CrossStackProfiler/DCGMFileReader.py b/tools/CrossStackProfiler/DCGMFileReader.py new file mode 100755 index 0000000000000000000000000000000000000000..599acb44c6556c1ecc43ad1e831355c201171e01 --- /dev/null +++ b/tools/CrossStackProfiler/DCGMFileReader.py @@ -0,0 +1,269 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import json +import glob +import logging +import tempfile +import argparse +import pandas as pd +import multiprocessing +from multiprocessing import Process + +from CspChromeTraceFormatter import ChromeTraceFormatter + +from CspFileReader import FileReader +from CspFileReader import getLogger +from CspFileReader import dcgmMetricParameterMap +from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH +from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM +from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM + + +class dcgmFileReader(FileReader): + def parseFileByGroup(self, groupId, processNum=8): + fileFist = self.getFileListByGroup(groupId) + displaySize = min(self._displaySize, len(fileFist)) + fileFist = fileFist[:displaySize] + + if processNum == 0: + return self._parseTask(fileFist) + + else: + self._logger.info("using [%d] process to do this work!" % + processNum) + processPool = [] + pidList = [] + + manager = multiprocessing.Manager() + q = manager.Queue() + + taskList = self._splitTaskListForMultiProcess(fileFist, processNum) + for task in taskList: + subproc = Process( + target=self._parseTask, args=( + task, + q, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[DCGM reader]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, len(processPool))) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[DCGM reader]: process [%d] has exited! remained %d process!" + % (t.pid, len(pidList))) + + isFistProcess = True + for t in processPool: + if isFistProcess: + isFistProcess = False + dcgm_data = q.get() + else: + dcgm_data = pd.concat( + [dcgm_data, q.get()], axis=0, join='outer') + + return dcgm_data + + def _parseTask(self, taskList, q=None): + is_first = True + for fileName in taskList: + self._logger.info("I am processing %s!" % fileName) + tmp_data = self._parseSingleFile(fileName) + if tmp_data is None: + continue + + if is_first: + is_first = False + dcgm_data = tmp_data + else: + dcgm_data = pd.concat( + [dcgm_data, tmp_data], axis=0, join='outer') + dcgm_data = dcgm_data.dropna() + if not q is None: + q.put(dcgm_data) + self._logger.info("I finish processing %s!" 
% fileName) + return dcgm_data + + def _parseSingleFile(self, fileName): + trainerId = self.getTrainerId(fileName) + + if not os.path.exists(fileName): + logging.warning(fileName + ' not found') + return + + regex_list = [ + (re.compile(r' +'), ','), + (re.compile(r'^,'), ''), + ] + + csv_tempfile = tempfile.TemporaryFile() + with open(fileName, 'r') as fp: + has_header = False + + for line in fp: + # skip `nvidia-dcgm-dmon.sh` init and fini info lines + if 'nv-hostengine' in line or 'dmon' in line or 'Host Engine Listener Started' in line: + continue + + if not line.strip().startswith("GPU") and not line.strip( + ).startswith("# Entity"): + continue + + # skip redundant headers (only the header on the first line is needed) + if line.strip().startswith("# Entity"): + line = line.strip()[2:] + + if 'Entity' == line[0:len('Entity')]: + if has_header: + continue + else: + has_header = True + + if line.strip().startswith("GPU"): + line = line.strip()[3:] + + for r in regex_list: + line = r[0].sub(r[1], line) + + csv_tempfile.write((line + "\n").encode('utf-8')) + + csv_tempfile.seek(0) + + dcgm = pd.read_csv(csv_tempfile, header=0, delimiter=',') + # dcgm.info() + dcgm['FB_USED_RATIO'] = dcgm['FBUSD'] / dcgm['FBTTL'] + dcgm['GPUTL'] = dcgm['GPUTL'] / 100.0 + dcgm['ts'] = dcgm['TIMESTAMP'] * 1e9 + dcgm['trainerId'] = trainerId + + return dcgm + + def _getDCGMTraceInfoByGpuId(self, + groupId, + gpuId, + dcgm_data, + pid_map, + q=None): + self._logger.info( + "Begin to generate dcgm info, groupId = %d, gpuID = %d ..." % + (groupId, gpuId)) + + gpuDcgmData = dcgm_data[dcgm_data['Entity'].isin([gpuId])] + + traceEventList = [] + for metric, parameterList in dcgmMetricParameterMap.items(): + metaInfo = {} + metaInfo['name'] = 'process_name' + metaInfo['ph'] = 'M' + metaInfo['pid'] = pid_map[metric] + metaInfo['args'] = {'name': metric} + traceEventList.append(metaInfo) + + for index, row in gpuDcgmData.iterrows(): + for metric, parameterList in dcgmMetricParameterMap.items(): + trainerId = int(row['trainerId']) % self._groupSize + if trainerId >= self._displaySize: + continue + + di = {} + # name = "%s_%d" % (metric, trainerId) + name = "%s" % (metric) + di['name'] = name + di['pid'] = pid_map[metric] + di['ts'] = self._align_ts(int(row['ts'])) + # di['ts'] = int(row['ts']) + di['cat'] = metric + di['tid'] = "%d_%d" % (groupId, trainerId) + di['ph'] = "C" + di['id'] = trainerId + + args = {} + for p in parameterList: + args[p[0]] = row[p[1]] + di['args'] = args + + traceEventList.append(di) + trace = {} + trace['traceEvents'] = traceEventList + self.dumpDCGMDict(trace, groupId, gpuId, True) + + return trace + + def getDCGMTraceInfo(self, groupId, processNum=8): + dcgm_data = self.parseFileByGroup(groupId, processNum) + + pid_map = {} + init_pid = PIPELINEINFO_TRACE_NUM + + for metric in dcgmMetricParameterMap.keys(): + pid_map[metric] = init_pid + init_pid = init_pid + 1 + + manager = multiprocessing.Manager() + q = manager.Queue() + processPool = [] + pidList = [] + + for gpuId in range(self._gpuPerTrainer): + subproc = Process( + target=self._getDCGMTraceInfoByGpuId, + args=( + groupId, + gpuId, + dcgm_data, + pid_map, + q, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[DCGM info]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, 1)) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[DCGM info]: process [%d] has exited! remained %d process!"
% + (t.pid, len(pidList))) + + dcgmInfo = {} + + return dcgmInfo + + +def test_dcgmFileReader(): + args = { + "dataPath": "data/newdata/dcgm", + "groupSize": 4, + "displaySize": 8, + "gpuPerTrainer": 8, + "minTimeStamp": 0, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + + testReader = dcgmFileReader(getLogger(), args) + testReader.printArgs() + data = testReader.getDCGMTraceInfo(0, 8) + + +if __name__ == "__main__": + test_dcgmFileReader() diff --git a/tools/CrossStackProfiler/NetFileReader.py b/tools/CrossStackProfiler/NetFileReader.py new file mode 100755 index 0000000000000000000000000000000000000000..29c2ae85e60458f6b712e3eeadb739a2dee70d09 --- /dev/null +++ b/tools/CrossStackProfiler/NetFileReader.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import glob +import logging +import multiprocessing +import pandas as pd + +from multiprocessing import Process + +from CspChromeTraceFormatter import ChromeTraceFormatter + +from CspFileReader import FileReader +from CspFileReader import getLogger +from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH +from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM +from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM + + +class netFileReader(FileReader): + def _parseSingleFile(self, fileNameList, tx_pid, rx_pid, q=None): + + traceInfo = {} + traceEventList = [] + + metaInfo = {} + metaInfo['name'] = 'process_name' + metaInfo['ph'] = 'M' + metaInfo['pid'] = tx_pid + metaInfo['args'] = {'name': "%02d_tx" % tx_pid} + + traceEventList.append(metaInfo) + metaInfo = {} + metaInfo['name'] = 'process_name' + metaInfo['ph'] = 'M' + metaInfo['pid'] = rx_pid + metaInfo['args'] = {'name': "%02d_rx" % rx_pid} + + traceEventList.append(metaInfo) + + trainerIdList = [] + for fileName in fileNameList: + trainerId = self.getTrainerId(fileName) + trainerIdList.append(trainerId) + with open(fileName, "r") as rf: + for line in rf: + try: + event_str = json.loads(line.strip()) + event_str["pid"] = tx_pid if event_str[ + "name"] == "tx" else rx_pid + # the net log unit is ms, the trace needs ns + event_str["ts"] = self._align_ts(event_str["ts"] * 1e6) + event_str["id"] = trainerId + traceEventList.append(event_str) + + except Exception: + self._logger.warning( + "invalid record [%s] in [%s]. skip it!"
% + (line[:-1], fileName)) + traceInfo["traceEvents"] = traceEventList + + if not q is None: + q.put(traceInfo) + else: + return traceInfo + + def parseFileByGroup(self, groupId, processNum=8): + fileFist = self.getFileListByGroup(groupId) + fileFist = fileFist[:min(self._displaySize, len(fileFist))] + + manager = multiprocessing.Manager() + q = manager.Queue() + + processPool = [] + pidList = [] + tx_pid = PIPELINEINFO_TRACE_NUM + rx_pid = PIPELINEINFO_TRACE_NUM + 1 + + taskList = self._splitTaskListForMultiProcess(fileFist, processNum) + for task in taskList: + subproc = Process( + target=self._parseSingleFile, args=( + task, + tx_pid, + rx_pid, + q, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[Net info]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, len(processPool))) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[Net info]: process [%d] has exited! remained %d process!" % + (t.pid, len(pidList))) + + traceInfo = {} + isFistProcess = True + for t in processPool: + if isFistProcess: + isFistProcess = False + traceInfo["traceEvents"] = q.get()["traceEvents"] + else: + traceInfo["traceEvents"].extend(q.get()["traceEvents"]) + + return traceInfo + + +def test_netFileReader(): + args = { + "dataPath": "data/newdata/net", + "groupSize": 4, + "displaySize": 2, + "gpuPerTrainer": 8, + "minTimeStamp": 0, + "organizeForm": FILEORGANIZEFORM_BYTRAINER, + } + + testReader = netFileReader(getLogger(), args) + testReader.printArgs() + data = testReader.parseFileByGroup(0, 8) + + jsObj = json.dumps(data, indent=4, separators=(',', ': ')) + fileObject = open('jsonFile.json', 'w') + fileObject.write(jsObj) + fileObject.close() + + +if __name__ == "__main__": + test_netFileReader() diff --git a/tools/CrossStackProfiler/ProfileFileReader.py b/tools/CrossStackProfiler/ProfileFileReader.py new file mode 100755 index 0000000000000000000000000000000000000000..0f3299ef5473fad6cde3c06ae99ba7727e1a7206 --- /dev/null +++ b/tools/CrossStackProfiler/ProfileFileReader.py @@ -0,0 +1,480 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
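All of these readers ultimately assemble records in the Chrome Trace Event Format (metadata 'M', complete 'X', and counter 'C' events) and dump them as a {"traceEvents": [...]} JSON object. A minimal, self-contained sketch of that output shape (the event values below are made up for illustration and are not taken from the patch):

import json

events = []
# metadata event: names a process row in the viewer
events.append({'name': 'process_name', 'ph': 'M', 'pid': 0,
               'args': {'name': '00_pipeLineInfo'}})
# complete event: a region with a start timestamp and a duration
events.append({'ph': 'X', 'cat': 'Op', 'name': 'forward', 'pid': 0,
               'tid': 0, 'ts': 0, 'dur': 1000, 'args': {}})
# counter event: a sampled value plotted as a track
events.append({'ph': 'C', 'cat': 'Memory', 'name': 'Memory', 'pid': 0,
               'ts': 0, 'args': {'Memory': 4096}})

with open('trace_demo.json', 'w') as f:
    json.dump({'traceEvents': events}, f, separators=(',', ':'))
# open trace_demo.json in chrome://tracing to inspect it

This mirrors what ChromeTraceFormatter.format_to_string and FileReader.dumpDict produce, just without the multiprocessing plumbing.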
+ +import os +import six +import glob +import json +import logging +import argparse +import pandas as pd +import multiprocessing +from multiprocessing import Process + +import google.protobuf.text_format as text_format +import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2 + +from CspChromeTraceFormatter import ChromeTraceFormatter + +from CspFileReader import FileReader +from CspFileReader import getLogger +from CspFileReader import TIME_PATH, DCGM_PATH, NET_PATH, PROFILE_PATH +from CspFileReader import NETINFO_TRACE_NUM, DCGMINFO_TRACE_NUM, PIPELINEINFO_TRACE_NUM +from CspFileReader import FILEORGANIZEFORM_BYRANK, FILEORGANIZEFORM_BYTRAINER, FILEORGANIZEFORM_BYOTHER, FILEORGANIZEFORM + + +class profileFileReader(FileReader): + def _parseSingleFile(self, profile): + with open(profile, 'rb') as f: + profile_s = f.read() + profile_pb = profiler_pb2.Profile() + profile_pb.ParseFromString(profile_s) + + return profile_pb + + def _parseTask(self, taskList, q=None): + profile_dict = {} + + for fileName in taskList: + rankId = self.getRankId(fileName) + profile_dict["trainerRank.%03d" % + (rankId)] = self._parseSingleFile(fileName) + self._logger.info("I finish processing %s!" % fileName) + + if not q is None: + q.put(profile_dict) + + return profile_dict + + def _is_forwardBackwardInfo(self, items): + if items["name"] == "marker/compute/MarkerCUDA": + if "args" in items: + if isinstance(items["args"], dict): + args = items["args"] + if args.get("detail_info") in ( + "marker_forward_B", "marker_forward_E", + "marker_backward_B", "marker_backward_E"): + return True + return False + + def _allocate_forwardBackwardInfo(self, restList, pid, tid): + def _cmp_ele(items): + return items["ts"] + + restList.sort(key=_cmp_ele) + newList = [] + + lastEle = {} + for items in restList: + if items["args"]["detail_info"].endswith("E"): + if not lastEle: + continue + else: + lastEle["dur"] = items["ts"] - lastEle["ts"] + name = lastEle["args"]["detail_info"] + name = name[:name.rfind('_')] + name = name.split('_')[1] + lastEle["name"] = name + lastEle["args"]["detail_info"] = name + lastEle["args"]["name"] = name + if name == "backward": + lastEle["cname"] = "good" + else: + lastEle["cname"] = "bad" + + lastEle["tid"] = tid + lastEle["pid"] = pid + + newList.append(lastEle) + else: + lastEle = items + + return newList + + def _getPipeLineInfo(self, profileList, q=None): + + res = {} + for profile in profileList: + rankId = self.getRankId(profile) + + profile_pb = self._parseSingleFile(profile) + traceEventList = [] + pid = 0 + tid = rankId + + for event in profile_pb.events: + args = {'name': event.name} + if event.memcopy.bytes > 0: + args['mem_bytes'] = event.memcopy.bytes + if hasattr(event, "detail_info") and event.detail_info: + args['detail_info'] = event.detail_info + + traceEvent = {} + traceEvent['ph'] = 'X' + traceEvent['cat'] = 'Op' + traceEvent['name'] = event.name + traceEvent['pid'] = pid + traceEvent['tid'] = tid + traceEvent['ts'] = self._align_ts(event.start_ns) + traceEvent['dur'] = (event.end_ns - event.start_ns) / 1.0 + traceEvent['args'] = args + + if self._is_forwardBackwardInfo(traceEvent): + traceEventList.append(traceEvent) + + pipeLineList = self._allocate_forwardBackwardInfo(traceEventList, + pid, tid) + + res[str(rankId)] = pipeLineList + + if not q is None: + q.put(res) + + return res + + def getPipeLineInfo(self, groupId, processNum=8): + fileFist =
self.getFileListByGroup(groupId) + + self._logger.info( + "using [%d] process to do this work, total task num is %d!" % + (processNum, len(fileFist))) + processPool = [] + pidList = [] + + manager = multiprocessing.Manager() + q = manager.Queue() + + taskList = self._splitTaskListForMultiProcess(fileFist, processNum) + for task in taskList: + subproc = Process( + target=self._getPipeLineInfo, args=( + task, + q, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[pipeline info]: process [%d] has been started, total task num is %d ..." + % (subproc.pid, len(task))) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[pipeline info]: process [%d] has exited! remained %d process!" + % (t.pid, len(pidList))) + + pipeLineInfo = {} + + metaInfo = {} + metaInfo['name'] = 'process_name' + metaInfo['ph'] = 'M' + metaInfo['pid'] = 0 + metaInfo['args'] = { + 'name': "%02d_pipeLineInfo" % PIPELINEINFO_TRACE_NUM + } + + for t in processPool: + for k, v in q.get().items(): + rankId = int(k) + gpuId = rankId % self._gpuPerTrainer + if str(gpuId) not in pipeLineInfo.keys(): + pipeLineInfo[str(gpuId)] = [metaInfo] + pipeLineInfo[str(gpuId)].extend(v) + + return pipeLineInfo + + def _allocate_pids(self, profile_dict, gpuId, initPid): + chrome_trace = ChromeTraceFormatter() + devices = dict() + mem_devices = dict() + + initLineNum = initPid + 1 + lineDelta = len(profile_dict.keys()) + i = 0 + for k, profile_pb in six.iteritems(profile_dict): + lineNum = initLineNum + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + if (k, event.device_id, "CPU") not in devices: + pid = initPid + initPid = initPid + 1 + devices[(k, event.device_id, "CPU")] = pid + # -1 device id represents CUDA API(RunTime) call.(e.g. 
cudaLaunch, cudaMemcpy) + if event.device_id == -1: + chrome_trace.emit_pid("%02d_%s:cuda_api" % + (lineNum, k), pid) + lineNum = lineNum + 1 + else: + chrome_trace.emit_pid("%02d_%s:cpu:block:%d" % + (lineNum, k, event.device_id), + pid) + lineNum = lineNum + 1 + elif event.type == profiler_pb2.Event.GPUKernel: + if (k, event.device_id, "GPUKernel") not in devices: + if gpuId == event.device_id: + pid = initPid + initPid = initPid + 1 + + devices[(k, event.device_id, "GPUKernel")] = pid + chrome_trace.emit_pid("%02d_%s:gpu:%d" % + (lineNum, k, event.device_id), + pid) + lineNum = lineNum + 1 + + if not hasattr(profile_pb, "mem_events"): + continue + for mevent in profile_pb.mem_events: + if mevent.place == profiler_pb2.MemEvent.CUDAPlace: + if (k, mevent.device_id, "GPU") not in mem_devices: + if gpuId == mevent.device_id: + pid = initPid + initPid = initPid + 1 + + mem_devices[(k, mevent.device_id, "GPU")] = pid + chrome_trace.emit_pid( + "%02d_memory usage on %s:gpu:%d" % + (lineNum, k, mevent.device_id), pid) + lineNum = lineNum + 1 + elif mevent.place == profiler_pb2.MemEvent.CPUPlace: + if (k, mevent.device_id, "CPU") not in mem_devices: + pid = initPid + initPid = initPid + 1 + + mem_devices[(k, mevent.device_id, "CPU")] = pid + chrome_trace.emit_pid("%02d_memory usage on %s:cpu:%d" % + (lineNum, k, mevent.device_id), + pid) + lineNum = lineNum + 1 + elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace: + if (k, mevent.device_id, "CUDAPinnedPlace" + ) not in mem_devices: + if gpuId == mevent.device_id: + pid = initPid + initPid = initPid + 1 + + mem_devices[(k, mevent.device_id, + "CUDAPinnedPlace")] = pid + chrome_trace.emit_pid( + "%02d_memory usage on %s:cudapinnedplace:%d" % + (lineNum, k, mevent.device_id), pid) + lineNum = lineNum + 1 + if (k, 0, "CPU") not in mem_devices: + pid = initPid + initPid = initPid + 1 + + mem_devices[(k, 0, "CPU")] = pid + chrome_trace.emit_pid("%02d_memory usage on %s:cpu:%d" % + (lineNum, k, 0), pid) + lineNum = lineNum + 1 + if (k, 0, "GPU") not in mem_devices: + # if gpuId == mevent.device_id: + pid = initPid + initPid = initPid + 1 + + mem_devices[(k, 0, "GPU")] = pid + chrome_trace.emit_pid("%02d_memory usage on %s:gpu:%d" % + (lineNum, k, 0), pid) + lineNum = lineNum + 1 + if (k, 0, "CUDAPinnedPlace") not in mem_devices: + pid = initPid + initPid = initPid + 1 + + mem_devices[(k, 0, "CUDAPinnedPlace")] = pid + chrome_trace.emit_pid( + "%02d_memory usage on %s:cudapinnedplace:%d" % + (lineNum, k, 0), pid) + lineNum = lineNum + 1 + i = i + 1 + return chrome_trace, devices, mem_devices + + def _allocate_events(self, profile_dict, devices, gpuId): + chrome_trace = ChromeTraceFormatter() + for k, profile_pb in six.iteritems(profile_dict): + + rankId = int(k.split(".")[-1]) + + for event in profile_pb.events: + if event.type == profiler_pb2.Event.CPU: + type = "CPU" + elif event.type == profiler_pb2.Event.GPUKernel: + type = "GPUKernel" + + if event.type == profiler_pb2.Event.GPUKernel and event.device_id != gpuId and rankId % self._gpuPerTrainer != gpuId: + continue + + pid = devices[(k, event.device_id, type)] + args = {'name': event.name} + if event.memcopy.bytes > 0: + args['mem_bytes'] = event.memcopy.bytes + if hasattr(event, "detail_info") and event.detail_info: + args['detail_info'] = event.detail_info + # TODO(panyx0718): Chrome tracing only handles ms. However, some + # ops takes micro-seconds. Hence, we keep the ns here. 
+ chrome_trace.emit_region( + self._align_ts(event.start_ns), + (event.end_ns - event.start_ns) / 1.0, pid, + event.sub_device_id, 'Op', event.name, args) + return chrome_trace + + def _allocate_memory_event(self, profile_dict, mem_devices, gpuId): + chrome_trace = ChromeTraceFormatter() + if not hasattr(profiler_pb2, "MemEvent"): + # return the (empty) formatter so callers can still read ._events + return chrome_trace + place_to_str = { + profiler_pb2.MemEvent.CPUPlace: "CPU", + profiler_pb2.MemEvent.CUDAPlace: "GPU", + profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace" + } + for k, profile_pb in six.iteritems(profile_dict): + rankId = int(k.split(".")[-1]) + + trainerId = rankId // self._gpuPerTrainer + + if trainerId >= self._displaySize: + continue + + mem_list = [] + end_profiler = 0 + for mevent in profile_pb.mem_events: + crt_info = dict() + crt_info['time'] = mevent.start_ns + crt_info['size'] = mevent.bytes + if mevent.place in place_to_str: + place = place_to_str[mevent.place] + else: + place = "UnDefine" + + if (mevent.place == profiler_pb2.MemEvent.CUDAPlace or + mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace + ) and mevent.device_id != gpuId: + continue + + crt_info['place'] = place + pid = mem_devices[(k, mevent.device_id, place)] + crt_info['pid'] = pid + crt_info['thread_id'] = mevent.thread_id + crt_info['device_id'] = mevent.device_id + mem_list.append(crt_info) + crt_info = dict() + crt_info['place'] = place + crt_info['pid'] = pid + crt_info['thread_id'] = mevent.thread_id + crt_info['device_id'] = mevent.device_id + crt_info['time'] = mevent.end_ns + crt_info['size'] = -mevent.bytes + mem_list.append(crt_info) + end_profiler = max(end_profiler, crt_info['time']) + mem_list.sort(key=lambda tmp: (tmp.get('time', 0))) + i = 0 + total_size = 0 + while i < len(mem_list): + total_size += mem_list[i]['size'] + while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[ + i + 1]['time']: + total_size += mem_list[i + 1]['size'] + i += 1 + + chrome_trace.emit_counter( + "Memory", "Memory", mem_list[i]['pid'], + self._align_ts(mem_list[i]['time']), 0, total_size) + i += 1 + return chrome_trace + + def _getOPTraceInfoByGpuId(self, groupId, gpuId): + fileFist = self.getFileListByGroup(groupId) + newFileList = [] + for file in fileFist: + rankId = self.getRankId(file) + localRank = rankId % self._gpuPerTrainer + if localRank == gpuId and (rankId // self._gpuPerTrainer + ) % self._groupSize < self._displaySize: + newFileList.append(file) + + profile_dict = self._parseTask(newFileList) + initPid = PIPELINEINFO_TRACE_NUM + DCGMINFO_TRACE_NUM + NETINFO_TRACE_NUM + metaTrace, devicesPid, mem_devicesPid = self._allocate_pids( + profile_dict, gpuId, initPid) + eventsTrace = self._allocate_events(profile_dict, devicesPid, gpuId) + memEventsTrace = self._allocate_memory_event(profile_dict, + mem_devicesPid, gpuId) + + trace = {} + trace[ + 'traceEvents'] = metaTrace._metadata + eventsTrace._events + memEventsTrace._events + self.dumpOpInfoDict(trace, groupId, gpuId, True) + + return trace + + def getOPTraceInfo(self, groupId): + manager = multiprocessing.Manager() + q = manager.Queue() + processPool = [] + pidList = [] + + for gpuId in range(self._gpuPerTrainer): + subproc = Process( + target=self._getOPTraceInfoByGpuId, args=( + groupId, + gpuId, )) + processPool.append(subproc) + subproc.start() + pidList.append(subproc.pid) + self._logger.info( + "[op info]: process [%d] has been started, total task num is %d ..."
+ % (subproc.pid, 1)) + + for t in processPool: + t.join() + pidList.remove(t.pid) + self._logger.info( + "[op info]: process [%d] has exited! remained %d process!" % + (t.pid, len(pidList))) + + opInfo = {} + + return opInfo + + def parseFileByGroup(self, groupId, processNum=8): + fileFist = self.getFileListByGroup(groupId) + if processNum == 0: + return self._parseTask(fileFist) + else: + return self._parseTask(fileFist) + + +def test_profileFileReader(): + args = { + "dataPath": "data/newdata/profile", + "groupSize": 4, + "displaySize": 8, + "gpuPerTrainer": 8, + "minTimeStamp": 0, + "organizeForm": FILEORGANIZEFORM_BYRANK, + } + + testReader = profileFileReader(getLogger(), args) + testReader.printArgs() + data = testReader.getOPTraceInfo(0) + + jsObj = json.dumps(data) + fileObject = open('jsonFile.json', 'w') + fileObject.write(jsObj) + fileObject.close() + + +if __name__ == "__main__": + test_profileFileReader() diff --git a/tools/CrossStackProfiler/__init__.py b/tools/CrossStackProfiler/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..6f0ea85344b7e0c679730356928c8749cf71cd66 --- /dev/null +++ b/tools/CrossStackProfiler/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tools/cudaError/README.md b/tools/cudaError/README.md deleted file mode 100644 index df7434c33a9fd7c6dfcf8c3cd7479169d748ca48..0000000000000000000000000000000000000000 --- a/tools/cudaError/README.md +++ /dev/null @@ -1,22 +0,0 @@ -Usage: - -Please run: -``` -bash start.sh -``` - -The error message of CUDA9.0 / CUDA10.0 / CUDA-latest-version will be crawled by default. - -If you want to crawl a specified version of CUDA, Please run: -``` -bash start.sh -``` -URL can be derived by default, so you don't have to enter a URL. - -for example: -``` -bash start.sh 11.0 -``` -will capture error message of CUDA11.0(in future). - -Every time when Nvidia upgrade the CUDA major version, you need to run `bash start.sh` in current directory, and upload cudaErrorMessage.tar.gz to https://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py deleted file mode 100644 index c2c3dc97f422202e96d9c6ab58ce462e7dbd980e..0000000000000000000000000000000000000000 --- a/tools/cudaError/spider.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
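The deleted crawler below (and its externalError replacement) persists its results through python-protobuf serialization. As a hedged, general illustration of that round trip, with `ErrorDesc` standing in for the real generated message class (the actual .proto schema is not shown in this diff):

# Hedged sketch of the SerializeToString/ParseFromString round trip used below.
# `ErrorDesc` is a hypothetical generated message class; the real schema is
# produced by protoc from the project's .proto file, which this diff omits.
def save_messages(desc, path):
    with open(path, "wb") as f:
        f.write(desc.SerializeToString())  # compact binary wire format

def load_messages(message_cls, path):
    desc = message_cls()
    with open(path, "rb") as f:
        desc.ParseFromString(f.read())
    return desc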
diff --git a/tools/cudaError/spider.py b/tools/cudaError/spider.py
deleted file mode 100644
index c2c3dc97f422202e96d9c6ab58ce462e7dbd980e..0000000000000000000000000000000000000000
--- a/tools/cudaError/spider.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import ssl
-import re
-import urllib2
-import json
-import collections
-import sys, getopt
-import cuda_error_pb2
-
-
-def parsing(cuda_errorDesc, version, url):
-    All_Messages = cuda_errorDesc.AllMessages.add()
-    All_Messages.version = int(version)
-
-    ssl._create_default_https_context = ssl._create_unverified_context
-    html = urllib2.urlopen(url).read()
-    res_div = r'<div class="section">.*?<h2 class="title">CUDA error types</h2>.*?<div class="description">.*?<dl class="enum-members">(.*?)</dl>'
-    m_div = re.findall(res_div, html, re.S | re.M)
-
-    url_list = url.split('/')
-    url_prefix = '/'.join(url_list[0:url_list.index('cuda-runtime-api') + 1])
-
-    dic = collections.OrderedDict()
-    dic_message = collections.OrderedDict()
-    for line in m_div:
-        res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
-        m_dt = re.findall(res_dt, line, re.S | re.M)
-        for error in m_dt:
-            res_type = r'<span class=.*?>(.*?)</span>'
-            m_type = re.findall(res_type, error[0], re.S | re.M)[0]
-            m_message = error[1]
-            m_message = m_message.replace('\n', '')
-            res_a = r'(<a class=.*?</a>)'
-            res_shape = r'<a class=.*?>(.*?)</a>'
-            list_a = re.findall(res_a, m_message, re.S | re.M)
-            list_shape = re.findall(res_shape, m_message, re.S | re.M)
-            assert len(list_a) == len(list_shape)
-            for idx in range(len(list_a)):
-                m_message = m_message.replace(list_a[idx], list_shape[idx])
-
-            m_message = m_message.replace(
-                '<span class="deprecated">Deprecated</span>', '')
-
-            res_span = r'(<span class=.*?</span>)'
-            res_span_detail = r'<span class=.*?>(.*?)</span>'
-            list_span = re.findall(res_span, m_message, re.S | re.M)
-            list_span_detail = re.findall(res_span_detail, m_message, re.S |
-                                          re.M)
-            assert len(list_span) == len(list_span_detail)
-            for idx in range(len(list_span)):
-                m_message = m_message.replace(list_span[idx],
-                                              list_span_detail[idx])
-
-            res_p = r'(<p>.*?</p>)'
-            res_p_detail = r'<p>(.*?)</p>'
-            list_p = re.findall(res_p, m_message, re.S | re.M)
-            list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
-            assert len(list_p) == len(list_p_detail)
-            for idx in range(len(list_p)):
-                m_message = m_message.replace(list_p[idx], list_p_detail[idx])
-
-            m_message = m_message.replace('  ', '')
-            _Messages = All_Messages.Messages.add()
-            try:
-                _Messages.errorCode = int(m_type)
-            except ValueError:
-                if re.match('0x', m_type):
-                    _Messages.errorCode = int(m_type, 16)
-                else:
-                    raise ValueError
-            _Messages.errorMessage = m_message  # save for cudaErrorMessage.pb from python-protobuf interface
-
-
-def main(argv):
-    version = []
-    url = []
-    try:
-        opts, args = getopt.getopt(argv, "hv:u:", ["help", "version=", "url="])
-    except getopt.GetoptError:
-        print 'python spider.py -v <version> -u <url>'
-        sys.exit(2)
-    for opt, arg in opts:
-        if opt in ("-h", "--help"):
-            print 'python spider.py -v <version> -u <url>'
-            sys.exit()
-        elif opt in ("-v", "--version"):
-            version = arg
-        elif opt in ("-u", "--url"):
-            url = arg
-    version = version.split(',')
-    url = url.split(',')
-    assert len(version) == len(url)
-    cuda_errorDesc = cuda_error_pb2.cudaerrorDesc()
-    for idx in range(len(version)):
-        if version[idx] == "-1":
-            print("crawling errorMessage for CUDA%s from %s" %
-                  ("-latest-version", url[idx]))
-        else:
-            print("crawling errorMessage for CUDA%s from %s" %
-                  (version[idx], url[idx]))
-        parsing(cuda_errorDesc, version[idx], url[idx])
-
-    serializeToString = cuda_errorDesc.SerializeToString()
-    with open("cudaErrorMessage.pb", "wb") as f:
-        f.write(serializeToString
-                )  # save for cudaErrorMessage.pb from python-protobuf interface
-    print("crawling errorMessage for CUDA has been done!!!")
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
diff --git a/tools/externalError/README.md b/tools/externalError/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..029efd8cb949190135aeb12635e6951f5d979d7d
--- /dev/null
+++ b/tools/externalError/README.md
@@ -0,0 +1,9 @@
+Usage:
+
+Please run:
+```
+bash start.sh
+```
+
+If you want to update all external error messages, run `bash start.sh` in the current directory,
+and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz
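The archive produced by `start.sh` wraps a serialized `ExternalErrorDesc` protobuf. As a minimal sketch (not part of this patch), the file could be read back with the Python bindings generated from `external_error.proto`; the `errors`/`messages`/`code`/`message` field names follow the spider below, while the input path is illustrative:

```python
import external_error_pb2  # generated by: protoc --python_out=. external_error.proto

desc = external_error_pb2.ExternalErrorDesc()
with open("externalErrorMsg.pb", "rb") as f:
    desc.ParseFromString(f.read())

# One entry per third-party API (CUDA, CURAND, CUDNN, CUBLAS, CUSOLVER, NCCL),
# each holding code -> message pairs.
for api in desc.errors:
    for msg in api.messages:
        print(api.type, msg.code, msg.message)
```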
diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py
new file mode 100644
index 0000000000000000000000000000000000000000..a74d82f40ebebd3e59cd2b94715905e7157cbbef
--- /dev/null
+++ b/tools/externalError/spider.py
@@ -0,0 +1,363 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ssl
+import re
+import urllib.request
+import json
+import collections
+import sys, getopt
+import external_error_pb2
+
+
+def parsing(externalErrorDesc):
+    #*********************************************************************************************#
+    #*********************************** CUDA Error Message **************************************#
+    print("start crawling errorMessage for nvidia CUDA API--->")
+    url = 'https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038'
+
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.CUDA
+
+    ssl._create_default_https_context = ssl._create_unverified_context
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+
+    res_div = r'<div class="section">.*?<h2 class="title">CUDA error types</h2>.*?<div class="description">.*?<dl class="enum-members">(.*?)</dl>'
+    m_div = re.findall(res_div, html, re.S | re.M)[0]
+
+    res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
+    m_dt = re.findall(res_dt, m_div, re.S | re.M)
+    for error in m_dt:
+        res_type = r'<span class=.*?>(.*?) = (.*?)</span>'
+        m_type = re.findall(res_type, error[0], re.S | re.M)[0]
+        m_message = error[1]
+        m_message = m_message.replace('\n', '')
+        res_a = r'(<a class=.*?</a>)'
+        res_shape = r'<a class=.*?>(.*?)</a>'
+        list_a = re.findall(res_a, m_message, re.S | re.M)
+        list_shape = re.findall(res_shape, m_message, re.S | re.M)
+        assert len(list_a) == len(list_shape)
+        for idx in range(len(list_a)):
+            m_message = m_message.replace(list_a[idx], list_shape[idx])
+
+        m_message = m_message.replace(
+            '<span class="deprecated">Deprecated</span>', '')
+
+        res_span = r'(<span class=.*?</span>)'
+        res_span_detail = r'<span class=.*?>(.*?)</span>'
+        list_span = re.findall(res_span, m_message, re.S | re.M)
+        list_span_detail = re.findall(res_span_detail, m_message, re.S | re.M)
+        assert len(list_span) == len(list_span_detail)
+        for idx in range(len(list_span)):
+            m_message = m_message.replace(list_span[idx], list_span_detail[idx])
+
+        res_p = r'(<p>.*?</p>)'
+        res_p_detail = r'<p>(.*?)</p>'
+        list_p = re.findall(res_p, m_message, re.S | re.M)
+        list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
+        assert len(list_p) == len(list_p_detail)
+        for idx in range(len(list_p)):
+            m_message = m_message.replace(list_p[idx], list_p_detail[idx])
+
+        m_message = m_message.replace('  ', '')
+        _Messages = allMessageDesc.messages.add()
+        try:
+            _Messages.code = int(m_type[1])
+        except ValueError:
+            if re.match('0x', m_type[1]):
+                _Messages.code = int(m_type[1], 16)
+            else:
+                raise ValueError
+        _Messages.message = "'%s'. %s" % (m_type[0], m_message)
+    print("End crawling errorMessage for nvidia CUDA API!\n")
+
+    #***********************************************************************************************#
+    #*********************************** CURAND Error Message **************************************#
+    print("start crawling errorMessage for nvidia CURAND API--->")
+    url = 'https://docs.nvidia.com/cuda/curand/group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437'
+
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.CURAND
+
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+
+    res_div = r'<div class="section">.*?<h2 class="title">CURAND function call status types</h2>.*?<div class="description">.*?<dl class="enum-members">(.*?)</dl>'
+    m_div = re.findall(res_div, html, re.S | re.M)[0]
+
+    res_dt = r'<dt>(.*?)</dt>.*?<dd>(.*?)</dd>'
+    m_dt = re.findall(res_dt, m_div, re.S | re.M)
+    for error in m_dt:
+        res_type = r'<span class=.*?>(.*?) = (.*?)</span>'
+        m_type = re.findall(res_type, error[0], re.S | re.M)[0]
+        m_message = error[1]
+
+        _Messages = allMessageDesc.messages.add()
+        try:
+            _Messages.code = int(m_type[1])
+        except ValueError:
+            if re.match('0x', m_type[1]):
+                _Messages.code = int(m_type[1], 16)
+            else:
+                raise ValueError
+        _Messages.message = "'%s'. %s" % (m_type[0], m_message)
+    print("End crawling errorMessage for nvidia CURAND API!\n")
+
+    #**************************************************************************************************#
+    #*********************************** CUDNN Error Message ******************************************#
+    cudnnStatus_t = {
+        "CUDNN_STATUS_SUCCESS": 0,
+        "CUDNN_STATUS_NOT_INITIALIZED": 1,
+        "CUDNN_STATUS_ALLOC_FAILED": 2,
+        "CUDNN_STATUS_BAD_PARAM": 3,
+        "CUDNN_STATUS_INTERNAL_ERROR": 4,
+        "CUDNN_STATUS_INVALID_VALUE": 5,
+        "CUDNN_STATUS_ARCH_MISMATCH": 6,
+        "CUDNN_STATUS_MAPPING_ERROR": 7,
+        "CUDNN_STATUS_EXECUTION_FAILED": 8,
+        "CUDNN_STATUS_NOT_SUPPORTED": 9,
+        "CUDNN_STATUS_LICENSE_ERROR": 10,
+        "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING": 11,
+        "CUDNN_STATUS_RUNTIME_IN_PROGRESS": 12,
+        "CUDNN_STATUS_RUNTIME_FP_OVERFLOW": 13,
+    }
+
+    print("start crawling errorMessage for nvidia CUDNN API--->")
+    url = 'https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnStatus_t'
+
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.CUDNN
+
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+
+    res_div = r'<div class="section" id="cudnnStatus_t".*?</div>'
+    m_div = re.findall(res_div, html, re.S | re.M)[0]
+
+    res_dt = r'<dt class=.*?>(.*?)</dt>.*?<dd class=.*?>(.*?)</dd>'
+    m_dt = re.findall(res_dt, m_div, re.S | re.M)
+    for error in m_dt:
+        m_message = error[1]
+
+        res_class = r'<p class="p"><span class=.*?>.*?</span></p>'
+        res_class_detail = r'<p class="p"><span class=.*?>(.*?)</span></p>'
+        list_class = re.findall(res_class, m_message, re.S | re.M)
+        list_class_detail = re.findall(res_class_detail, m_message, re.S | re.M)
+        assert len(list_class) == len(list_class_detail)
+        for idx in range(len(list_class)):
+            m_message = m_message.replace(list_class[idx],
+                                          list_class_detail[idx])
+
+        res_a = r'(<a class=.*?</a>)'
+        res_shape = r'<a class=.*?>(.*?)</a>'
+        list_a = re.findall(res_a, m_message, re.S | re.M)
+        list_shape = re.findall(res_shape, m_message, re.S | re.M)
+        assert len(list_a) == len(list_shape)
+        for idx in range(len(list_a)):
+            m_message = m_message.replace(list_a[idx], list_shape[idx])
+
+        res_span = r'(<span class=.*?>.*?</span>)'
+        res_span_detail = r'<span class=.*?>(.*?)</span>'
+        list_span = re.findall(res_span, m_message, re.S | re.M)
+        list_span_detail = re.findall(res_span_detail, m_message, re.S | re.M)
+        assert len(list_span) == len(list_span_detail)
+        for idx in range(len(list_span)):
+            m_message = m_message.replace(list_span[idx], list_span_detail[idx])
+
+        res_samp = r'(<samp class=.*?>.*?</samp>)'
+        res_samp_detail = r'<samp class=.*?>(.*?)</samp>'
+        list_samp = re.findall(res_samp, m_message, re.S | re.M)
+        list_samp_detail = re.findall(res_samp_detail, m_message, re.S | re.M)
+        assert len(list_samp) == len(list_samp_detail)
+        for idx in range(len(list_samp)):
+            m_message = m_message.replace(list_samp[idx], list_samp_detail[idx])
+
+        m_message = re.sub(r'\n +', ' ', m_message)
+
+        _Messages = allMessageDesc.messages.add()
+        _Messages.code = int(cudnnStatus_t[error[0]])
+        _Messages.message = "'%s'. %s" % (error[0], m_message)
+    print("End crawling errorMessage for nvidia CUDNN API!\n")
+
+    #*************************************************************************************************#
+    #*********************************** CUBLAS Error Message ****************************************#
+    cublasStatus_t = {
+        "CUBLAS_STATUS_SUCCESS": 0,
+        "CUBLAS_STATUS_NOT_INITIALIZED": 1,
+        "CUBLAS_STATUS_ALLOC_FAILED": 3,
+        "CUBLAS_STATUS_INVALID_VALUE": 7,
+        "CUBLAS_STATUS_ARCH_MISMATCH": 8,
+        "CUBLAS_STATUS_MAPPING_ERROR": 11,
+        "CUBLAS_STATUS_EXECUTION_FAILED": 13,
+        "CUBLAS_STATUS_INTERNAL_ERROR": 14,
+        "CUBLAS_STATUS_NOT_SUPPORTED": 15,
+        "CUBLAS_STATUS_LICENSE_ERROR": 16
+    }
+
+    print("start crawling errorMessage for nvidia CUBLAS API--->")
+    url = 'https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t'
+
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.CUBLAS
+
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+
+    res_div = r'<div class="section"><p class="p">The type is used for function status returns. All cuBLAS library.*?</p><tbody class="tbody">(.*?)</tbody>'
+    m_div = re.findall(res_div, html, re.S | re.M)[0]
+
+    res_dt = r'<td class="entry"><samp class="ph codeph">(.*?)</samp></td>.*?colspan="1">(.*?)</td>'
+    m_dt = re.findall(res_dt, m_div, re.S | re.M)
+
+    for error in m_dt:
+        m_message = error[1]
+        m_message = re.sub(r'\n +', ' ', m_message)
+
+        res_p = r'<p class="p">.*?</p>'
+        res_p_detail = r'<p class="p">(.*?)</p>'
+        list_p = re.findall(res_p, m_message, re.S | re.M)
+        list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
+        assert len(list_p) == len(list_p_detail)
+        for idx in range(len(list_p)):
+            m_message = m_message.replace(list_p[idx], list_p_detail[idx])
+
+        res_samp = r'<samp class="ph codeph">.*?</samp>'
+        res_samp_detail = r'<samp class="ph codeph">(.*?)</samp>'
+        list_samp = re.findall(res_samp, m_message, re.S | re.M)
+        list_samp_detail = re.findall(res_samp_detail, m_message, re.S | re.M)
+        assert len(list_samp) == len(list_samp_detail)
+        for idx in range(len(list_samp)):
+            m_message = m_message.replace(list_samp[idx], list_samp_detail[idx])
+
+        _Messages = allMessageDesc.messages.add()
+        _Messages.code = int(cublasStatus_t[error[0]])
+        _Messages.message = "'%s'. %s" % (error[0], m_message)
+    print("End crawling errorMessage for nvidia CUBLAS API!\n")
+
+    #*************************************************************************************************#
+    #*********************************** CUSOLVER Error Message **************************************#
+    cusolverStatus_t = {
+        "CUSOLVER_STATUS_SUCCESS": 0,
+        "CUSOLVER_STATUS_NOT_INITIALIZED": 1,
+        "CUSOLVER_STATUS_ALLOC_FAILED": 2,
+        "CUSOLVER_STATUS_INVALID_VALUE": 3,
+        "CUSOLVER_STATUS_ARCH_MISMATCH": 4,
+        "CUSOLVER_STATUS_MAPPING_ERROR": 5,
+        "CUSOLVER_STATUS_EXECUTION_FAILED": 6,
+        "CUSOLVER_STATUS_INTERNAL_ERROR": 7,
+        "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED": 8,
+        "CUSOLVER_STATUS_NOT_SUPPORTED": 9,
+        "CUSOLVER_STATUS_ZERO_PIVOT": 10,
+        "CUSOLVER_STATUS_INVALID_LICENSE": 11,
+        "CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED": 12,
+        "CUSOLVER_STATUS_IRS_PARAMS_INVALID": 13,
+        "CUSOLVER_STATUS_IRS_INTERNAL_ERROR": 14,
+        "CUSOLVER_STATUS_IRS_NOT_SUPPORTED": 15,
+        "CUSOLVER_STATUS_IRS_OUT_OF_RANGE": 16,
+        "CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES": 17,
+        "CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED": 18
+    }
+    print("start crawling errorMessage for nvidia CUSOLVER API--->")
+    url = 'https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverSPstatus'
+
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.CUSOLVER
+
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+
+    res_div = r'This is a status type returned by the library functions and.*?</p><tbody class="tbody">(.*?)</tbody>'
+    m_div = re.findall(res_div, html, re.S | re.M)[0]
+
+    res_dt = r'<samp class="ph codeph">(.*?)</samp>.*?colspan="1">(.*?)</td>'
+    m_dt = re.findall(res_dt, m_div, re.S | re.M)
+
+    for error in m_dt:
+        m_message = error[1]
+        m_message = re.sub(r'\n +', '', m_message)
+        m_message = re.sub(r'</p><p class="p">', '', m_message)
+
+        res_p = r'<p class="p">.*?</p>'
+        res_p_detail = r'<p class="p">(.*?)</p>'
+        list_p = re.findall(res_p, m_message, re.S | re.M)
+        list_p_detail = re.findall(res_p_detail, m_message, re.S | re.M)
+        assert len(list_p) == len(list_p_detail)
+        for idx in range(len(list_p)):
+            m_message = m_message.replace(list_p[idx], list_p_detail[idx])
+
+        res_samp = r'<samp class="ph codeph">.*?</samp>'
+        res_samp_detail = r'<samp class="ph codeph">(.*?)</samp>'
+        list_samp = re.findall(res_samp, m_message, re.S | re.M)
+        list_samp_detail = re.findall(res_samp_detail, m_message, re.S | re.M)
+        assert len(list_samp) == len(list_samp_detail)
+        for idx in range(len(list_samp)):
+            m_message = m_message.replace(list_samp[idx], list_samp_detail[idx])
+
+        res_strong = r'<strong class="ph b">.*?</strong>'
+        res_strong_detail = r'<strong class="ph b">(.*?)</strong>'
+        list_strong = re.findall(res_strong, m_message, re.S | re.M)
+        list_strong_detail = re.findall(res_strong_detail, m_message, re.S |
+                                        re.M)
+        assert len(list_strong) == len(list_strong_detail)
+        for idx in range(len(list_strong)):
+            m_message = m_message.replace(list_strong[idx],
+                                          list_strong_detail[idx])
+
+        _Messages = allMessageDesc.messages.add()
+        _Messages.code = int(cusolverStatus_t[error[0]])
+        _Messages.message = "'%s'. %s" % (error[0], m_message)
+    print("End crawling errorMessage for nvidia CUSOLVER API!\n")
+
+    #**********************************************************************************************#
+    #*************************************** NCCL error *******************************************#
+    print("start crawling errorMessage for nvidia NCCL API--->")
+    url = 'https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/types.html#ncclresult-t'
+    allMessageDesc = externalErrorDesc.errors.add()
+    allMessageDesc.type = external_error_pb2.NCCL
+    html = urllib.request.urlopen(url).read().decode('utf-8')
+    res_div = r'<code class="descname">ncclResult_t</code>(.*?)</dl>'
+    m_div = re.findall(res_div, html, re.S | re.M)[0]
+
+    res_dt = r'<code class="descname">(.*?)</code>.*?<span class="pre">(.*?)\)</span>(.*?)</dd></dl>\n'
+    m_dt = re.findall(res_dt, m_div, re.S | re.M)
+    for error in m_dt:
+        m_message = re.sub(r'\n', '', error[2])
+        _Messages = allMessageDesc.messages.add()
+        _Messages.code = int(error[1])
+        _Messages.message = "'%s'. %s" % (error[0], m_message)
+    print("End crawling errorMessage for nvidia NCCL API!\n")
+
+
+def main(argv):
+    try:
+        opts, _ = getopt.getopt(argv, "h", ["help"])
+    except getopt.GetoptError:
+        print('python spider.py')
+        sys.exit(2)
+    for opt, _ in opts:
+        if opt in ("-h", "--help"):
+            print('python spider.py')
+            sys.exit(2)
+    externalErrorDesc = external_error_pb2.ExternalErrorDesc()
+    parsing(externalErrorDesc)
+
+    serializedString = externalErrorDesc.SerializeToString()
+    with open("externalErrorMsg.pb", "wb") as f:
+        # save for externalErrorMsg.pb from Python-protobuf interface
+        # load from C++-protobuf interface and get error message
+        f.write(serializedString)
+    print(
+        "Generating data file [externalErrorMsg.pb] for external third_party API error has been done!"
+    )
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/tools/cudaError/start.sh b/tools/externalError/start.sh
similarity index 59%
rename from tools/cudaError/start.sh
rename to tools/externalError/start.sh
index 66e56b8485d8c6d40937bf821c1889424da33527..32ef63c261268191646b03648ca46fbc15485087 100644
--- a/tools/cudaError/start.sh
+++ b/tools/externalError/start.sh
@@ -29,19 +29,7 @@ else
     echo "please run on Mac/Linux"
     exit 1
 fi
-protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/cuda_error.proto
+protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto
 
-version=90,100,-1 # -1 represent the latest cuda-version
-url=https://docs.nvidia.com/cuda/archive/9.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/archive/10.0/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038,https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
-
-if [ "$1" != "" ]; then
-    version=$version,$(($1*10))
-    if [ "$2" != "" ]; then
-        url=$url,$2
-    else
-        url=$url,https://docs.nvidia.com/cuda/archive/$1/cuda-runtime-api/group__CUDART__TYPES.html#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038
-    fi
-fi
-
-python spider.py --version=$version --url=$url
-tar czf cudaErrorMessage.tar.gz cudaErrorMessage.pb
+python3.7 spider.py
+tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb
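The spider above applies one cleanup idiom per tag kind: collect the full tag spans and their inner texts with two parallel `findall` calls, then substitute pairwise. A condensed, self-contained illustration of that idiom (the sample HTML is made up):

```python
import re

def strip_tag(html, tag):
    # Replace every <tag ...>inner</tag> span with its inner text.
    spans = re.findall(r'(<%s.*?>.*?</%s>)' % (tag, tag), html, re.S)
    inners = re.findall(r'<%s.*?>(.*?)</%s>' % (tag, tag), html, re.S)
    assert len(spans) == len(inners)
    for span, inner in zip(spans, inners):
        html = html.replace(span, inner)
    return html

print(strip_tag('returns <span class="x">cudaSuccess</span>', 'span'))
# -> returns cudaSuccess
```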
""" + isWhiteFile = False + not_white_files = (PADDLE_ROOT + 'cmake/', PADDLE_ROOT + 'patches/', + PADDLE_ROOT + 'tools/dockerfile/', + PADDLE_ROOT + 'tools/windows/', + PADDLE_ROOT + 'tools/test_runner.py', + PADDLE_ROOT + 'tools/parallel_UT_rule.py', + PADDLE_ROOT + 'paddle/scripts/paddle_build.sh', + PADDLE_ROOT + 'paddle/scripts/paddle_build.bat') + if 'cmakelist' in filename.lower(): + isWhiteFile = False + elif filename.startswith((not_white_files)): + isWhiteFile = False + else: + isWhiteFile = True + return isWhiteFile def __get_comment_by_filetype(self, content, filetype): result = [] @@ -232,13 +248,15 @@ class PRChecker(object): return True def get_all_count(self): - os.system( - "cd %sbuild && ctest -N|grep 'Total Tests:' | awk -F ': ' '{print $2}' > testCount" - % PADDLE_ROOT) - f = open("%sbuild/testCount" % PADDLE_ROOT) - testCount = f.read() - f.close() - return int(testCount.strip()) + p = subprocess.Popen( + "cd {}build && ctest -N".format(PADDLE_ROOT), + shell=True, + stdout=subprocess.PIPE) + out, err = p.communicate() + for line in out.splitlines(): + if 'Total Tests:' in str(line): + all_counts = line.split()[-1] + return int(all_counts) def get_pr_ut(self): """ Get unit tests in pull request. """ @@ -247,24 +265,58 @@ class PRChecker(object): check_added_ut = False ut_list = [] file_ut_map = None + ret = self.__urlretrieve( 'https://paddle-docker-tar.bj.bcebos.com/pre_test/ut_file_map.json', 'ut_file_map.json') if not ret: print('PREC download file_ut.json failed') exit(1) + with open('ut_file_map.json') as jsonfile: file_ut_map = json.load(jsonfile) current_system = platform.system() notHitMapFiles = [] - hitMapFiles = [] + hitMapFiles = {} onlyCommentsFilesOrXpu = [] - file_list = self.get_pr_files() - if 'removed' in file_list: - print("ipipe_log_param_PRECISION_TEST: false") - print("notHitMapFiles: [rm file]") - return '' + filterFiles = [] + file_list = [] + file_dict = self.get_pr_files() + for filename in file_dict: + if filename.startswith( + (PADDLE_ROOT + 'python/', PADDLE_ROOT + 'paddle/fluid/')): + file_list.append(filename) + else: + if file_dict[filename] == 'added': + file_list.append(filename) + else: + isWhiteFile = self.get_is_white_file(filename) + if isWhiteFile == False: + file_list.append(filename) + else: + filterFiles.append(filename) + if len(file_list) == 0: + ut_list.append('filterfiles_placeholder') + ret = self.__urlretrieve( + 'https://paddle-docker-tar.bj.bcebos.com/pre_test/prec_delta', + 'prec_delta') + if ret: + with open('prec_delta') as delta: + for ut in delta: + ut_list.append(ut.rstrip('\r\n')) + else: + print('PREC download prec_delta failed') + exit(1) + PRECISION_TEST_Cases_ratio = format( + float(len(ut_list)) / float(self.get_all_count()), '.2f') + print("filterFiles: %s" % filterFiles) + print("ipipe_log_param_PRECISION_TEST: true") + print("ipipe_log_param_PRECISION_TEST_Cases_count: %s" % + len(ut_list)) + print("ipipe_log_param_PRECISION_TEST_Cases_ratio: %s" % + PRECISION_TEST_Cases_ratio) + return '\n'.join(ut_list) else: for f in file_list: if current_system == "Darwin" or current_system == "Windows" or self.suffix == ".py3": @@ -283,24 +335,36 @@ class PRChecker(object): if f_judge.find('test_') != -1 or f_judge.find( '_test') != -1: check_added_ut = True + if file_dict[f] not in ['removed']: + if self.is_only_comment(f): + ut_list.append('comment_placeholder') + onlyCommentsFilesOrXpu.append(f_judge) + else: + notHitMapFiles.append(f_judge) + else: + print("remove file not hit mapFiles: %s" % f_judge) + else: + 
notHitMapFiles.append(f_judge) if file_dict[ + f] != 'removed' else print( + "remove file not hit mapFiles: %s" % f_judge) + else: + if file_dict[f] not in ['removed']: if self.is_only_comment(f): ut_list.append('comment_placeholder') onlyCommentsFilesOrXpu.append(f_judge) else: - notHitMapFiles.append(f_judge) + hitMapFiles[f_judge] = len(file_ut_map[f_judge]) + ut_list.extend(file_ut_map.get(f_judge)) else: - notHitMapFiles.append(f_judge) - else: - if self.is_only_comment(f): - ut_list.append('comment_placeholder') - onlyCommentsFilesOrXpu.append(f_judge) - else: - hitMapFiles.append(f_judge) + hitMapFiles[f_judge] = len(file_ut_map[f_judge]) ut_list.extend(file_ut_map.get(f_judge)) + ut_list = list(set(ut_list)) if len(notHitMapFiles) != 0: print("ipipe_log_param_PRECISION_TEST: false") print("notHitMapFiles: %s" % notHitMapFiles) + if len(filterFiles) != 0: + print("filterFiles: %s" % filterFiles) return '' else: if check_added_ut: @@ -318,6 +382,7 @@ class PRChecker(object): else: print('PREC download prec_delta failed') exit(1) + print("hitMapFiles: %s" % hitMapFiles) print("ipipe_log_param_PRECISION_TEST: true") print("ipipe_log_param_PRECISION_TEST_Cases_count: %s" % len(ut_list)) @@ -326,6 +391,8 @@ class PRChecker(object): '.2f') print("ipipe_log_param_PRECISION_TEST_Cases_ratio: %s" % PRECISION_TEST_Cases_ratio) + if len(filterFiles) != 0: + print("filterFiles: %s" % filterFiles) return '\n'.join(ut_list) diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 55b82084f6bc5acf5e5a0f267255d6c63b0ff4c4..70d7fb98cb5387d228805119e2c6a07347e5d45d 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -791,7 +791,6 @@ TWO_PARALLEL_JOB = [ 'test_callbacks', 'test_sigmoid_focal_loss_op', 'test_collect_fpn_proposals_op', - 'test_sgd_op', 'test_sequence_unpad_op', 'test_conv1d_transpose_layer', 'test_sequence_slice_op', @@ -852,7 +851,6 @@ TWO_PARALLEL_JOB = [ 'test_learning_rate_scheduler', 'test_linspace', 'test_linear_interp_op', - 'test_layer_norm_op_v2', 'test_lamb_op', 'test_lookup_table_v2_op', 'test_l1_norm_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 2c50c4bf9f6207d3eea5f917d68bdd218cb07aeb..d1e4680e63f95ed74246cdc63432d84a49d0342a 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -710,4 +710,6 @@ STATIC_MODE_TESTING_LIST = [ 'test_lamb_op_xpu', 'test_model_cast_to_bf16', 'test_sgd_op_bf16', + 'test_marker_op', + 'test_c_embedding_op', ]
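A closing note on the `get_all_count` rewrite in tools/get_pr_ut.py above: it assumes `ctest -N` ends its listing with a `Total Tests: <n>` line. A standalone sketch of the same parsing (Python 3.7+; the helper name and build directory are illustrative):

```python
import subprocess

def count_ctest_cases(build_dir):
    # "ctest -N" lists test cases without running them; the listing ends
    # with a line of the form "Total Tests: <n>".
    out = subprocess.run(
        ["ctest", "-N"], cwd=build_dir, capture_output=True,
        text=True).stdout
    for line in out.splitlines():
        if 'Total Tests:' in line:
            return int(line.split()[-1])
    return 0

print(count_ctest_cases("build"))
```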