Commit 3d769b97, authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into all_new_design_exec

@@ -284,6 +284,27 @@ if(WITH_GPU)
   endif()
 endif()

+if(WITH_ROCM)
+  include(hip)
+  include(miopen) # set miopen libraries, must before configure
+endif(WITH_ROCM)
+
+if (NOT WITH_ROCM AND WITH_RCCL)
+  MESSAGE(WARNING
+      "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
+  set(WITH_RCCL OFF CACHE STRING
+      "Disable RCCL when compiling without ROCM" FORCE)
+endif()
+
+if(WITH_RCCL)
+  add_definitions("-DPADDLE_WITH_RCCL")
+  include(rccl)
+else()
+  if(WITH_ROCM)
+    MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
+  endif()
+endif()
+
 include(third_party)  # download, build, install third_party, Contains about 20+ dependencies
 include(flags)        # set paddle compile flags
@@ -308,26 +329,6 @@ include(configure)  # add paddle env configuration

 include_directories("${PADDLE_SOURCE_DIR}")

-if(WITH_ROCM)
-  include(hip)
-endif(WITH_ROCM)
-
-if (NOT WITH_ROCM AND WITH_RCCL)
-  MESSAGE(WARNING
-      "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.")
-  set(WITH_RCCL OFF CACHE STRING
-      "Disable RCCL when compiling without ROCM" FORCE)
-endif()
-
-if(WITH_RCCL)
-  add_definitions("-DPADDLE_WITH_RCCL")
-  include(rccl)
-else()
-  if(WITH_ROCM)
-    MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.")
-  endif()
-endif()
-
 if(WITH_NV_JETSON)
   set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE)
 endif()
......
@@ -143,6 +143,14 @@ elseif(WITH_ROCM)
   add_definitions(-DPADDLE_WITH_HIP)
   add_definitions(-DEIGEN_USE_GPU)
   add_definitions(-DEIGEN_USE_HIP)
+
+  if(NOT MIOPEN_FOUND)
+    message(FATAL_ERROR "Paddle needs MIOpen to compile")
+  endif()
+
+  if(${MIOPEN_VERSION} VERSION_LESS 2090)
+    message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
+  endif()
 else()
   add_definitions(-DHPPL_STUB_FUNC)
   list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
......
@@ -95,11 +95,23 @@ function(select_nvcc_arch_flags out_variable)
   if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
     set(cuda_arch_bin "30 35")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
-    set(cuda_arch_bin "50")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "53")
+    else()
+      set(cuda_arch_bin "50")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
-    set(cuda_arch_bin "60 61")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "62")
+    else()
+      set(cuda_arch_bin "60 61")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
-    set(cuda_arch_bin "70")
+    if (WITH_NV_JETSON)
+      set(cuda_arch_bin "72")
+    else()
+      set(cuda_arch_bin "70")
+    endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
......
@@ -46,6 +46,7 @@ ExternalProject_Add(
   ${BOOST_PROJECT}
   ${EXTERNAL_PROJECT_LOG_ARGS}
   "${BOOST_DOWNLOAD_CMD}"
+  URL_MD5 f891e8c2c9424f0565f0129ad9ab4aff
   PREFIX ${BOOST_PREFIX_DIR}
   DOWNLOAD_DIR ${BOOST_SOURCE_DIR}
   SOURCE_DIR ${BOOST_SOURCE_DIR}
......
@@ -110,7 +110,7 @@ if(WIN32)
   add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
     COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
   add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
-    COMMAND for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def)
+    COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on)
   add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM
     COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64)
 else(WIN32)
......
@@ -24,6 +24,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
 IF(WIN32)
   SET(MKLML_VER "mklml_win_2019.0.5.20190502" CACHE STRING "" FORCE)
   SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE)
+  SET(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a)
   SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
   SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
   SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
@@ -33,6 +34,7 @@ ELSE()
   # Now enable csrmm function in mklml library temporarily; it will be updated to the official version later.
   SET(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE)
   SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+  SET(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da)
   SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
   SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
   SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -52,6 +54,7 @@ ExternalProject_Add(
   ${MKLML_PROJECT}
   ${EXTERNAL_PROJECT_LOG_ARGS}
   "${MKLML_DOWNLOAD_CMD}"
+  URL_MD5 ${MKLML_URL_MD5}
   PREFIX ${MKLML_PREFIX_DIR}
   DOWNLOAD_DIR ${MKLML_SOURCE_DIR}
   SOURCE_DIR ${MKLML_SOURCE_DIR}
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
INCLUDE(ExternalProject)
SET(ROCKSDB_SOURCES_DIR ${THIRD_PARTY_PATH}/rocksdb)
SET(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb)
SET(ROCKSDB_INCLUDE_DIR "${ROCKSDB_INSTALL_DIR}/include" CACHE PATH "rocksdb include directory." FORCE)
SET(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." FORCE)
SET(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
INCLUDE_DIRECTORIES(${ROCKSDB_INCLUDE_DIR})
ExternalProject_Add(
extern_rocksdb
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${ROCKSDB_SOURCES_DIR}
GIT_REPOSITORY "https://github.com/facebook/rocksdb"
GIT_TAG v6.10.1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DWITH_BZ2=OFF
-DWITH_GFLAGS=OFF
-DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
# BUILD_BYPRODUCTS ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a
INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/
&& cp ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES}
&& cp -r ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/include ${ROCKSDB_INSTALL_DIR}/
BUILD_IN_SOURCE 1
)
ADD_DEPENDENCIES(extern_rocksdb snappy)
ADD_LIBRARY(rocksdb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES})
ADD_DEPENDENCIES(rocksdb extern_rocksdb)
LIST(APPEND external_project_dependencies rocksdb)
@@ -24,7 +24,7 @@ SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 # in case of low internet speed
 #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git)
 set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git)
-set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1)
+set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b)

 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
     CACHE PATH "Warp-ctc Directory" FORCE)
......
@@ -146,12 +146,12 @@ copy(inference_lib_dist
         SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h
         DSTS ${dst_dir})

-# Only GPU need cudaErrorMessage.pb
+# GPU must copy externalErrorMsg.pb
 IF(WITH_GPU)
-    set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data")
+    set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data")
     copy(inference_lib_dist
-            SRCS ${cudaerror_INCLUDE_DIR}
+            SRCS ${externalError_INCLUDE_DIR}
             DSTS ${dst_dir})
 ENDIF()

 # CMakeCache Info
@@ -193,10 +193,7 @@ copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/*
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 copy(inference_lib_dist
-        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h
-        DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
-copy(inference_lib_dist
-        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h
+        SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h
         DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
 copy(inference_lib_dist
         SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h
@@ -259,7 +256,7 @@ copy(fluid_lib_dist
 set(module "platform")
 set(platform_lib_deps profiler_proto error_codes_proto)
 if(WITH_GPU)
-    set(platform_lib_deps ${platform_lib_deps} cuda_error_proto)
+    set(platform_lib_deps ${platform_lib_deps} external_error_proto)
 endif(WITH_GPU)
 add_dependencies(fluid_lib_dist ${platform_lib_deps})
@@ -323,12 +320,18 @@ function(version version_file)
       "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
       "WITH_MKL: ${WITH_MKL}\n"
       "WITH_MKLDNN: ${WITH_MKLDNN}\n"
-      "WITH_GPU: ${WITH_GPU}\n")
+      "WITH_GPU: ${WITH_GPU}\n"
+      "WITH_ROCM: ${WITH_ROCM}\n")
   if(WITH_GPU)
     file(APPEND ${version_file}
          "CUDA version: ${CUDA_VERSION}\n"
          "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n")
   endif()
+  if(WITH_ROCM)
+    file(APPEND ${version_file}
+         "HIP version: ${HIP_VERSION}\n"
+         "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n")
+  endif()
   file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
   if(TENSORRT_FOUND)
     file(APPEND ${version_file}
......
if(NOT WITH_ROCM)
return()
endif()
# ROCm is not supported on Windows for now
if(WIN32)
return()
endif()
set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT")
find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h"
PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include
$ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include
NO_DEFAULT_PATH
)
get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH)
find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so"
PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist}
$ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64
NO_DEFAULT_PATH
DOC "Path to MIOpen library.")
if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY)
set(MIOPEN_FOUND ON)
else()
set(MIOPEN_FOUND OFF)
endif()
macro(find_miopen_version miopen_header_file)
file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS)
get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY)
string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1"
MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}")
string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1"
MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}")
string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_PATCH +([0-9]+)" "\\1"
MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}")
string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION
"${MIOPEN_VERSION_FILE_CONTENTS}")
string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1"
MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}")
if(NOT MIOPEN_MAJOR_VERSION)
set(MIOPEN_VERSION "???")
else()
add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"")
math(EXPR MIOPEN_VERSION
"${MIOPEN_MAJOR_VERSION} * 1000 +
${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}")
message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h "
"Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ")
endif()
endmacro()
if(MIOPEN_FOUND)
find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h)
endif()
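
Taken together, find_miopen_version() packs major/minor/patch into one integer, which is what the VERSION_LESS 2090 guard in the configure step compares against. A standalone C++ sketch of the same encoding (the macro names mirror miopen/version.h):

    #include <cstdio>

    // Same packing as find_miopen_version() above:
    // version = major * 1000 + minor * 10 + patch, so MIOpen 2.9.0 -> 2090.
    constexpr int PackMiopenVersion(int major, int minor, int patch) {
      return major * 1000 + minor * 10 + patch;
    }

    int main() {
      // 2090 is exactly the minimum enforced by "VERSION_LESS 2090" above.
      std::printf("MIOpen 2.9.0 -> %d\n", PackMiopenVersion(2, 9, 0));
      return 0;
    }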
@@ -108,13 +108,19 @@ ENDMACRO()
 # 2. NAME: The name of the file, which determines the dirname
 #
 FUNCTION(file_download_and_uncompress URL NAME)
-  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
+  set(options "")
+  set(oneValueArgs MD5)
+  set(multiValueArgs "")
+  cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}")
   SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
   ExternalProject_Add(
-      extern_download_${NAME}
+      download_${NAME}
       ${EXTERNAL_PROJECT_LOG_ARGS}
       PREFIX                ${THIRD_PARTY_PATH}/${NAME}
       URL                   ${URL}
+      URL_MD5               ${URL_MD5}
+      TIMEOUT               120
       DOWNLOAD_DIR          ${THIRD_PARTY_PATH}/${NAME}/data/
       SOURCE_DIR            ${THIRD_PARTY_PATH}/${NAME}/data/
       DOWNLOAD_NO_PROGRESS  1

@@ -123,7 +129,7 @@ FUNCTION(file_download_and_uncompress URL NAME)
       UPDATE_COMMAND        ""
       INSTALL_COMMAND       ""
   )
-  set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE)
+  set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
 ENDFUNCTION()
@@ -242,8 +248,20 @@ if(WITH_GPU)
     include(external/cub)  # download cub
     list(APPEND third_party_deps extern_cub)
   endif()
-  set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
-  file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
+  set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
+  file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz
+  if(WITH_TESTING)
+    # copy externalErrorMsg.pb for unittest 'enforce_test'
+    set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
+    if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
+      set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
+    else()
+      set(DST_DIR ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
+    endif()
+    add_custom_command(TARGET download_externalError POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR}
+      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR}")
+  endif()
 endif(WITH_GPU)

 if(WITH_XPU)
@@ -304,6 +322,11 @@ if (WITH_PSCORE)
     include(external/libmct)  # download, build, install libmct
     list(APPEND third_party_deps extern_libmct)

+    if (WITH_HETERPS)
+        include(external/rocksdb)  # download, build, install rocksdb
+        list(APPEND third_party_deps extern_rocksdb)
+    endif()
+
 endif()

 if(WITH_XBYAK)
......
@@ -417,8 +417,10 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync(
   return;
 }

-void FleetWrapper::LoadModel(const std::string& path, const int mode) {
-  auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode));
+void FleetWrapper::LoadModel(const std::string& path, const std::string& mode) {
+  auto* communicator = Communicator::GetInstance();
+  auto ret = communicator->_worker_ptr->load(path, mode);
+  // auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode));
   ret.wait();
   if (ret.get() != 0) {
     LOG(ERROR) << "load model from path:" << path << " failed";

@@ -429,8 +431,11 @@ void FleetWrapper::LoadModel(const std::string& path, const int mode) {
 void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
                                      const std::string& path, const int mode) {
+  auto* communicator = Communicator::GetInstance();
   auto ret =
-      pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
+      communicator->_worker_ptr->load(table_id, path, std::to_string(mode));
+  // auto ret =
+  //     pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
   ret.wait();
   if (ret.get() != 0) {
     LOG(ERROR) << "load model of table id: " << table_id
......
@@ -200,7 +200,7 @@ class FleetWrapper {
   void PrintTableStat(const uint64_t table_id);
   // mode = 0, load all feature
   // mode = 1, load delta feature, which means load diff
-  void LoadModel(const std::string& path, const int mode);
+  void LoadModel(const std::string& path, const std::string& mode);
   // mode = 0, load all feature
   // mode = 1, load delta feature, which means load diff
   void LoadModelOneTable(const uint64_t table_id, const std::string& path,
......
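
With this change LoadModel takes the mode as a string instead of an int (per the comments above: 0 loads all features, 1 loads the delta). A hypothetical call site, assuming a configured FleetWrapper instance:

    #include <string>

    // Hypothetical call site for the updated API; `fleet` is assumed to be a
    // configured paddle::distributed::FleetWrapper*.
    void LoadCheckpoint(paddle::distributed::FleetWrapper* fleet,
                        const std::string& dirname) {
      fleet->LoadModel(dirname, "0");           // mode is now passed as a string
      fleet->LoadModelOneTable(/*table_id=*/0, dirname,
                               /*mode=*/0);     // per-table load still takes an int
    }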
@@ -42,17 +42,17 @@ int32_t PsLocalClient::initialize() {
 ::std::future<int32_t> PsLocalClient::load(const std::string& epoch,
                                            const std::string& mode) {
   // TODO
-  // for (auto& it : _table_map) {
-  //   load(it.first, epoch, mode);
-  //}
+  for (auto& it : _table_map) {
+    load(it.first, epoch, mode);
+  }
   return done();
 }

 ::std::future<int32_t> PsLocalClient::load(uint32_t table_id,
                                            const std::string& epoch,
                                            const std::string& mode) {
   // TODO
-  // auto* table_ptr = table(table_id);
-  // table_ptr->load(epoch, mode);
+  auto* table_ptr = table(table_id);
+  table_ptr->load(epoch, mode);
   return done();
 }
@@ -245,7 +245,6 @@ int32_t PsLocalClient::initialize() {
 ::std::future<int32_t> PsLocalClient::push_sparse_raw_gradient(
     size_t table_id, const uint64_t* keys, const float** update_values,
     size_t num, void* callback) {
-  VLOG(1) << "wxx push_sparse_raw_gradient";
   PSClientClosure* closure = reinterpret_cast<PSClientClosure*>(callback);
   auto* accessor = table_accessor(table_id);
   auto* table_ptr = table(table_id);
......
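
With the TODO bodies filled in, a local-mode load is no longer a no-op: the table-wise overload loads one table, and the epoch-wise overload now walks _table_map and loads every table before the done() future resolves. A hypothetical caller:

    // Hypothetical caller; `client` is assumed to be an initialized
    // paddle::distributed::PsLocalClient.
    void LoadAllTables(paddle::distributed::PsLocalClient* client,
                       const std::string& epoch_dir) {
      auto fut = client->load(epoch_dir, /*mode=*/"0");  // loads every table now
      fut.wait();                                        // done() future
    }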
@@ -26,9 +26,14 @@ class PsLocalServer : public PSServer {
   PsLocalServer() {}
   virtual ~PsLocalServer() {}
   virtual uint64_t start() { return 0; }
-  virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; }
+  virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; }
   virtual int32_t stop() { return 0; }
   virtual int32_t port() { return 0; }
+  virtual int32_t configure(
+      const PSParameter &config, PSEnvironment &env, size_t server_rank,
+      const std::vector<framework::ProgramDesc> &server_sub_program = {}) {
+    return 0;
+  }

  private:
   virtual int32_t initialize() { return 0; }
......
@@ -70,7 +70,7 @@ class PSServer {
   virtual int32_t configure(
       const PSParameter &config, PSEnvironment &env, size_t server_rank,
-      const std::vector<framework::ProgramDesc> &server_sub_program = {}) final;
+      const std::vector<framework::ProgramDesc> &server_sub_program = {});

   // return server_ip
   virtual std::string ip() { return butil::my_ip_cstr(); }
......
@@ -9,15 +9,24 @@ set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler)
 set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

 get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS)

-cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc
-        sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS}
-        ${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator)
+set(EXTERN_DEP "")
+if(WITH_HETERPS)
+    set(TABLE_SRC common_sparse_table.cc ssd_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc)
+    set(EXTERN_DEP rocksdb)
+else()
+    set(TABLE_SRC common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc)
+endif()
+
+cc_library(common_table SRCS ${TABLE_SRC} DEPS ${TABLE_DEPS}
+        ${RPC_DEPS} graph_edge graph_node device_context string_helper
+        simple_threadpool xxhash generator ${EXTERN_DEP})

 set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
......
@@ -25,83 +25,12 @@ class ValueBlock;
 }  // namespace distributed
 }  // namespace paddle

-#define PSERVER_SAVE_SUFFIX ".shard"
-using boost::lexical_cast;
-
 namespace paddle {
 namespace distributed {

-enum SaveMode { all, base, delta };
-
-struct Meta {
-  std::string param;
-  int shard_id;
-  std::vector<std::string> names;
-  std::vector<int> dims;
-  uint64_t count;
-  std::unordered_map<std::string, int> dims_map;
-
-  explicit Meta(const std::string& metapath) {
-    std::ifstream file(metapath);
-    std::string line;
-    int num_lines = 0;
-    while (std::getline(file, line)) {
-      if (StartWith(line, "#")) {
-        continue;
-      }
-      auto pairs = paddle::string::split_string<std::string>(line, "=");
-      PADDLE_ENFORCE_EQ(
-          pairs.size(), 2,
-          paddle::platform::errors::InvalidArgument(
-              "info in %s except k=v, but got %s", metapath, line));
-      if (pairs[0] == "param") {
-        param = pairs[1];
-      }
-      if (pairs[0] == "shard_id") {
-        shard_id = std::stoi(pairs[1]);
-      }
-      if (pairs[0] == "row_names") {
-        names = paddle::string::split_string<std::string>(pairs[1], ",");
-      }
-      if (pairs[0] == "row_dims") {
-        auto dims_strs =
-            paddle::string::split_string<std::string>(pairs[1], ",");
-        for (auto& str : dims_strs) {
-          dims.push_back(std::stoi(str));
-        }
-      }
-      if (pairs[0] == "count") {
-        count = std::stoull(pairs[1]);
-      }
-    }
-    for (int x = 0; x < names.size(); ++x) {
-      dims_map[names[x]] = dims[x];
-    }
-  }
-
-  Meta(std::string param, int shard_id, std::vector<std::string> row_names,
-       std::vector<int> dims, uint64_t count) {
-    this->param = param;
-    this->shard_id = shard_id;
-    this->names = row_names;
-    this->dims = dims;
-    this->count = count;
-  }
-
-  std::string ToString() {
-    std::stringstream ss;
-    ss << "param=" << param << "\n";
-    ss << "shard_id=" << shard_id << "\n";
-    ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n";
-    ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n";
-    ss << "count=" << count << "\n";
-    return ss.str();
-  }
-};
-
-void ProcessALine(const std::vector<std::string>& columns, const Meta& meta,
-                  const int64_t id, std::vector<std::vector<float>>* values) {
+void CommonSparseTable::ProcessALine(const std::vector<std::string>& columns,
+                                     const Meta& meta, const int64_t id,
+                                     std::vector<std::vector<float>>* values) {
   auto colunmn_size = columns.size();
   auto load_values =
       paddle::string::split_string<std::string>(columns[colunmn_size - 1], ",");

@@ -134,8 +63,10 @@ void ProcessALine(const std::vector<std::string>& columns, const Meta& meta,
   }
 }

-void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
-                    const size_t shard_idx, const int64_t total) {
+void CommonSparseTable::SaveMetaToText(std::ostream* os,
+                                       const CommonAccessorParameter& common,
+                                       const size_t shard_idx,
+                                       const int64_t total) {
   // save meta
   std::stringstream stream;
   stream << "param=" << common.table_name() << "\n";

@@ -148,8 +79,10 @@ void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
   os->write(stream.str().c_str(), sizeof(char) * stream.str().size());
 }

-int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
-                        std::shared_ptr<::ThreadPool> pool, const int mode) {
+int64_t CommonSparseTable::SaveValueToText(std::ostream* os,
+                                           std::shared_ptr<ValueBlock> block,
+                                           std::shared_ptr<::ThreadPool> pool,
+                                           const int mode, int shard_id) {
   int64_t save_num = 0;
   for (auto& table : block->values_) {
     for (auto& value : table) {

@@ -186,10 +119,10 @@ int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
   return save_num;
 }

-int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
-                     const int pserver_id, const int pserver_num,
-                     const int local_shard_num,
-                     std::vector<std::shared_ptr<ValueBlock>>* blocks) {
+int64_t CommonSparseTable::LoadFromText(
+    const std::string& valuepath, const std::string& metapath,
+    const int pserver_id, const int pserver_num, const int local_shard_num,
+    std::vector<std::shared_ptr<ValueBlock>>* blocks) {
   Meta meta = Meta(metapath);

   int num_lines = 0;

@@ -198,7 +131,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath,
   while (std::getline(file, line)) {
     auto values = paddle::string::split_string<std::string>(line, "\t");
-    auto id = lexical_cast<int64_t>(values[0]);
+    auto id = lexical_cast<uint64_t>(values[0]);

     if (id % pserver_num != pserver_id) {
       VLOG(3) << "will not load " << values[0] << " from " << valuepath

@@ -388,8 +321,9 @@ int32_t CommonSparseTable::save(const std::string& dirname,
   int64_t total_ins = 0;
   for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) {
     // save values
-    auto shard_save_num = SaveValueToText(vs.get(), shard_values_[shard_id],
-                                          _shards_task_pool[shard_id], mode);
+    auto shard_save_num =
+        SaveValueToText(vs.get(), shard_values_[shard_id],
+                        _shards_task_pool[shard_id], mode, shard_id);
     total_ins += shard_save_num;
   }
   vs->close();
......
@@ -32,11 +32,83 @@
 #include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/string/string_helper.h"

+#define PSERVER_SAVE_SUFFIX ".shard"
+using boost::lexical_cast;
+
 namespace paddle {
 namespace distributed {

 class SparseOptimizer;

+enum SaveMode { all, base, delta };
+
+struct Meta {
+  std::string param;
+  int shard_id;
+  std::vector<std::string> names;
+  std::vector<int> dims;
+  uint64_t count;
+  std::unordered_map<std::string, int> dims_map;
+
+  explicit Meta(const std::string& metapath) {
+    std::ifstream file(metapath);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      if (StartWith(line, "#")) {
+        continue;
+      }
+      auto pairs = paddle::string::split_string<std::string>(line, "=");
+      PADDLE_ENFORCE_EQ(
+          pairs.size(), 2,
+          paddle::platform::errors::InvalidArgument(
+              "info in %s except k=v, but got %s", metapath, line));
+      if (pairs[0] == "param") {
+        param = pairs[1];
+      }
+      if (pairs[0] == "shard_id") {
+        shard_id = std::stoi(pairs[1]);
+      }
+      if (pairs[0] == "row_names") {
+        names = paddle::string::split_string<std::string>(pairs[1], ",");
+      }
+      if (pairs[0] == "row_dims") {
+        auto dims_strs =
+            paddle::string::split_string<std::string>(pairs[1], ",");
+        for (auto& str : dims_strs) {
+          dims.push_back(std::stoi(str));
+        }
+      }
+      if (pairs[0] == "count") {
+        count = std::stoull(pairs[1]);
+      }
+    }
+    for (int x = 0; x < names.size(); ++x) {
+      dims_map[names[x]] = dims[x];
+    }
+  }
+
+  Meta(std::string param, int shard_id, std::vector<std::string> row_names,
+       std::vector<int> dims, uint64_t count) {
+    this->param = param;
+    this->shard_id = shard_id;
+    this->names = row_names;
+    this->dims = dims;
+    this->count = count;
+  }
+
+  std::string ToString() {
+    std::stringstream ss;
+    ss << "param=" << param << "\n";
+    ss << "shard_id=" << shard_id << "\n";
+    ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n";
+    ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n";
+    ss << "count=" << count << "\n";
+    return ss.str();
+  }
+};
+
 class CommonSparseTable : public SparseTable {
  public:
   CommonSparseTable() { rwlock_.reset(new framework::RWLock); }
@@ -56,9 +128,25 @@ class CommonSparseTable : public SparseTable {
   virtual int32_t initialize_optimizer();
   virtual int32_t initialize_recorder();

-  int32_t load(const std::string& path, const std::string& param);
-
-  int32_t save(const std::string& path, const std::string& param);
+  virtual int32_t load(const std::string& path, const std::string& param);
+
+  virtual int32_t save(const std::string& path, const std::string& param);
+
+  void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
+                      const size_t shard_idx, const int64_t total);
+
+  int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
+                          std::shared_ptr<::ThreadPool> pool, const int mode,
+                          int shard_id);
+
+  virtual void ProcessALine(const std::vector<std::string>& columns,
+                            const Meta& meta, const int64_t id,
+                            std::vector<std::vector<float>>* values);
+
+  virtual int64_t LoadFromText(
+      const std::string& valuepath, const std::string& metapath,
+      const int pserver_id, const int pserver_num, const int local_shard_num,
+      std::vector<std::shared_ptr<ValueBlock>>* blocks);

   virtual std::pair<int64_t, int64_t> print_table_stat();
   virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);

@@ -89,7 +177,7 @@ class CommonSparseTable : public SparseTable {
   virtual int32_t _push_sparse(const uint64_t* keys, const float** values,
                                size_t num);

- private:
+ protected:
   const int task_pool_size_ = 11;
   std::vector<std::shared_ptr<::ThreadPool>> _shards_task_pool;
......
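
Since Meta now lives in the header, other tables (the SSD table below) can reuse it. A small sketch of the round trip between ToString() and the parsing constructor, with illustrative row names and dims:

    #include <iostream>

    #include "paddle/fluid/distributed/table/common_sparse_table.h"

    int main() {
      // Illustrative values; real row names/dims come from the table config.
      paddle::distributed::Meta meta("embedding", /*shard_id=*/0,
                                     {"Param", "Moment1", "Moment2"},
                                     {8, 8, 8}, /*count=*/1024);
      // Emits the key=value text that Meta(metapath) parses back in:
      //   param=embedding
      //   shard_id=0
      //   row_names=Param,Moment1,Moment2
      //   row_dims=8,8,8
      //   count=1024
      std::cout << meta.ToString();
      return 0;
    }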
@@ -83,6 +83,7 @@ inline bool probility_entry(VALUE *value, float threshold) {
 class ValueBlock {
  public:
+  typedef typename robin_hood::unordered_map<uint64_t, VALUE *> map_type;
   explicit ValueBlock(const std::vector<std::string> &value_names,
                       const std::vector<int> &value_dims,
                       const std::vector<int> &value_offsets,

@@ -261,6 +262,18 @@ class ValueBlock {
     value->is_entry_ = state;
   }

+  void erase(uint64_t feasign) {
+    size_t hash = _hasher(feasign);
+    size_t bucket = compute_bucket(hash);
+    auto &table = values_[bucket];
+
+    auto iter = table.find(feasign);
+    if (iter != table.end()) {
+      butil::return_object(iter->second);
+      iter = table.erase(iter);
+    }
+  }
+
   void Shrink(const int threshold) {
     for (auto &table : values_) {
       for (auto iter = table.begin(); iter != table.end();) {

@@ -289,6 +302,23 @@ class ValueBlock {
     }
   }

+  map_type::iterator end() {
+    return values_[SPARSE_SHARD_BUCKET_NUM - 1].end();
+  }
+
+  map_type::iterator Find(uint64_t id) {
+    size_t hash = _hasher(id);
+    size_t bucket = compute_bucket(hash);
+    auto &table = values_[bucket];
+
+    auto got = table.find(id);
+    if (got == table.end()) {
+      return end();
+    } else {
+      return got;
+    }
+  }
+
  private:
   bool Has(const uint64_t id) {
     size_t hash = _hasher(id);

@@ -304,7 +334,7 @@ class ValueBlock {
   }

  public:
-  robin_hood::unordered_map<uint64_t, VALUE *> values_[SPARSE_SHARD_BUCKET_NUM];
+  map_type values_[SPARSE_SHARD_BUCKET_NUM];
   size_t value_length_ = 0;
   std::hash<uint64_t> _hasher;
......
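
The new Find/end/erase members give callers bucket-transparent lookups over the SPARSE_SHARD_BUCKET_NUM shard maps; a miss in any bucket is normalized to the last bucket's end(). A hypothetical caller, sketched against the interface above:

    // Hypothetical helper; `block` is assumed to be a
    // paddle::distributed::ValueBlock* as declared above.
    void EvictIfStale(paddle::distributed::ValueBlock* block, uint64_t feasign) {
      auto iter = block->Find(feasign);
      if (iter == block->end()) {
        return;  // not resident in memory
      }
      if (iter->second->unseen_days_ >= 1) {
        block->erase(feasign);  // returns the VALUE to butil's object pool
      }
    }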
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HETERPS
#include <glog/logging.h>
#include <rocksdb/db.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/options.h>
#include <rocksdb/slice.h>
#include <rocksdb/table.h>
#include <rocksdb/write_batch.h>
#include <iostream>
#include <string>
namespace paddle {
namespace distributed {
class RocksDBHandler {
public:
RocksDBHandler() {}
~RocksDBHandler() {}
static RocksDBHandler* GetInstance() {
static RocksDBHandler handler;
return &handler;
}
int initialize(const std::string& db_path, const int colnum) {
VLOG(3) << "db path: " << db_path << " colnum: " << colnum;
rocksdb::Options options;
rocksdb::BlockBasedTableOptions bbto;
bbto.block_size = 4 * 1024;
bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024);
bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024);
bbto.cache_index_and_filter_blocks = false;
bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false));
bbto.whole_key_filtering = true;
options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto));
options.keep_log_file_num = 100;
options.max_log_file_size = 50 * 1024 * 1024; // 50MB
options.create_if_missing = true;
options.use_direct_reads = true;
options.max_background_flushes = 5;
options.max_background_compactions = 5;
options.base_background_compactions = 10;
options.write_buffer_size = 256 * 1024 * 1024; // 256MB
options.max_write_buffer_number = 8;
options.max_bytes_for_level_base =
options.max_write_buffer_number * options.write_buffer_size;
options.min_write_buffer_number_to_merge = 1;
options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB
options.memtable_prefix_bloom_size_ratio = 0.02;
options.num_levels = 4;
options.max_open_files = -1;
options.compression = rocksdb::kNoCompression;
options.level0_file_num_compaction_trigger = 8;
options.level0_slowdown_writes_trigger =
1.8 * options.level0_file_num_compaction_trigger;
options.level0_stop_writes_trigger =
3.6 * options.level0_file_num_compaction_trigger;
if (!db_path.empty()) {
std::string rm_cmd = "rm -rf " + db_path;
system(rm_cmd.c_str());
}
rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db);
assert(s.ok());
_handles.resize(colnum);
for (int i = 0; i < colnum; i++) {
s = _db->CreateColumnFamily(options, "shard_" + std::to_string(i),
&_handles[i]);
assert(s.ok());
}
LOG(INFO) << "DB initialize success, colnum:" << colnum;
return 0;
}
int put(int id, const char* key, int key_len, const char* value,
int value_len) {
rocksdb::WriteOptions options;
options.disableWAL = true;
rocksdb::Status s =
_db->Put(options, _handles[id], rocksdb::Slice(key, key_len),
rocksdb::Slice(value, value_len));
assert(s.ok());
return 0;
}
int put_batch(int id, std::vector<std::pair<char*, int>>& ssd_keys,
std::vector<std::pair<char*, int>>& ssd_values, int n) {
rocksdb::WriteOptions options;
options.disableWAL = true;
rocksdb::WriteBatch batch(n * 128);
for (int i = 0; i < n; i++) {
batch.Put(_handles[id],
rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second),
rocksdb::Slice(ssd_values[i].first, ssd_values[i].second));
}
rocksdb::Status s = _db->Write(options, &batch);
assert(s.ok());
return 0;
}
int get(int id, const char* key, int key_len, std::string& value) {
rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), _handles[id],
rocksdb::Slice(key, key_len), &value);
if (s.IsNotFound()) {
return 1;
}
assert(s.ok());
return 0;
}
int del_data(int id, const char* key, int key_len) {
rocksdb::WriteOptions options;
options.disableWAL = true;
rocksdb::Status s =
_db->Delete(options, _handles[id], rocksdb::Slice(key, key_len));
assert(s.ok());
return 0;
}
int flush(int id) {
rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]);
assert(s.ok());
return 0;
}
rocksdb::Iterator* get_iterator(int id) {
return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]);
}
int get_estimate_key_num(uint64_t& num_keys) {
_db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys);
return 0;
}
private:
std::vector<rocksdb::ColumnFamilyHandle*> _handles;
rocksdb::DB* _db;
};
}  // namespace distributed
}  // namespace paddle
#endif
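
A minimal sketch of driving this handler (the same sequence SSDSparseTable uses below: one column family per shard, raw byte slices for keys and values; the path is illustrative):

    #include <cstdint>
    #include <string>

    int DemoRocksDBRoundTrip() {
      auto* db = paddle::distributed::RocksDBHandler::GetInstance();
      db->initialize("/tmp/demo_rocksdb", /*colnum=*/4);  // one CF per shard

      uint64_t key = 42;
      float values[3] = {0.1f, 0.2f, 0.3f};
      db->put(/*id=*/0, reinterpret_cast<char*>(&key), sizeof(key),
              reinterpret_cast<char*>(values), sizeof(values));

      std::string out;
      if (db->get(/*id=*/0, reinterpret_cast<char*>(&key), sizeof(key), out) != 0) {
        return 1;  // get() returns 1 when the key is absent
      }
      return db->flush(/*id=*/0);
    }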
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file");
namespace paddle {
namespace distributed {
int32_t SSDSparseTable::initialize() {
_shards_task_pool.resize(task_pool_size_);
for (int i = 0; i < _shards_task_pool.size(); ++i) {
_shards_task_pool[i].reset(new ::ThreadPool(1));
}
sync = _config.common().sync();
VLOG(1) << "table " << _config.common().table_name() << " is sync: " << sync;
_global_lr = new float(1.0);
auto common = _config.common();
int size = static_cast<int>(common.params().size());
size_t offset = 0;
for (int x = 0; x < size; ++x) {
auto& varname = common.params()[x];
auto& dim = common.dims()[x];
value_idx_[varname] = x;
value_names_.push_back(varname);
value_dims_.push_back(dim);
value_offsets_.push_back(offset);
initializer_attrs_.push_back(common.initializers()[x]);
if (varname == "Param") {
param_dim_ = dim;
param_offset_ = offset;
}
offset += dim;
}
initialize_value();
initialize_optimizer();
initialize_recorder();
_db = paddle::distributed::RocksDBHandler::GetInstance();
_db->initialize(FLAGS_rocksdb_path, task_pool_size_);
return 0;
}
int32_t SSDSparseTable::pull_sparse(float* pull_values,
const PullSparseValue& pull_value) {
auto shard_num = task_pool_size_;
std::vector<std::future<int>> tasks(shard_num);
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, shard_num, &pull_value, &pull_values]() -> int {
auto& block = shard_values_[shard_id];
std::vector<int> offsets;
pull_value.Fission(shard_id, shard_num, &offsets);
for (auto& offset : offsets) {
auto feasign = pull_value.feasigns_[offset];
auto frequencie = pull_value.frequencies_[offset];
float* embedding = nullptr;
auto iter = block->Find(feasign);
// in mem
if (iter != block->end()) {
embedding = iter->second->data_.data();
if (pull_value.is_training_) {
block->AttrUpdate(iter->second, frequencie);
}
} else {
// need create
std::string tmp_str("");
if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t),
tmp_str) > 0) {
embedding = block->Init(feasign, true, frequencie);
} else {
// in db
int data_size = tmp_str.size() / sizeof(float);
int value_size = block->value_length_;
float* db_value = (float*)const_cast<char*>(tmp_str.c_str());
VALUE* value = block->InitGet(feasign);
// copy to mem
memcpy(value->data_.data(), db_value,
value_size * sizeof(float));
embedding = value->data_.data();  // use the in-memory copy; db_value aliases tmp_str
// param, count, unseen_day
value->count_ = db_value[value_size];
value->unseen_days_ = db_value[value_size + 1];
value->is_entry_ = db_value[value_size + 2];
if (pull_value.is_training_) {
block->AttrUpdate(value, frequencie);
}
}
}
std::copy_n(embedding + param_offset_, param_dim_,
pull_values + param_dim_ * offset);
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t SSDSparseTable::pull_sparse_ptr(char** pull_values,
const uint64_t* keys, size_t num) {
auto shard_num = task_pool_size_;
std::vector<std::future<int>> tasks(shard_num);
std::vector<std::vector<uint64_t>> offset_bucket;
offset_bucket.resize(task_pool_size_);
for (int x = 0; x < num; ++x) {
auto y = keys[x] % task_pool_size_;
offset_bucket[y].push_back(x);
}
for (int shard_id = 0; shard_id < shard_num; ++shard_id) {
tasks[shard_id] = _shards_task_pool[shard_id]->enqueue(
[this, shard_id, &keys, &pull_values, &offset_bucket]() -> int {
auto& block = shard_values_[shard_id];
auto& offsets = offset_bucket[shard_id];
for (auto& offset : offsets) {
auto feasign = keys[offset];
auto iter = block->Find(feasign);
VALUE* value = nullptr;
// in mem
if (iter != block->end()) {
value = iter->second;
} else {
// need create
std::string tmp_str("");
if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t),
tmp_str) > 0) {
value = block->InitGet(feasign);
} else {
// in db
int data_size = tmp_str.size() / sizeof(float);
int value_size = block->value_length_;
float* db_value = (float*)const_cast<char*>(tmp_str.c_str());
value = block->InitGet(feasign);
// copy to mem
memcpy(value->data_.data(), db_value,
value_size * sizeof(float));
// param, count, unseen_day
value->count_ = db_value[value_size];
value->unseen_days_ = db_value[value_size + 1];
value->is_entry_ = db_value[value_size + 2];
}
}
pull_values[offset] = (char*)value;
}
return 0;
});
}
for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) {
tasks[shard_id].wait();
}
return 0;
}
int32_t SSDSparseTable::shrink(const std::string& param) { return 0; }
int32_t SSDSparseTable::update_table() {
int count = 0;
int value_size = shard_values_[0]->value_length_;
int db_size = 3 + value_size;
float tmp_value[db_size];
for (size_t i = 0; i < task_pool_size_; ++i) {
auto& block = shard_values_[i];
for (auto& table : block->values_) {
for (auto iter = table.begin(); iter != table.end();) {
VALUE* value = iter->second;
if (value->unseen_days_ >= 1) {
tmp_value[value_size] = value->count_;
tmp_value[value_size + 1] = value->unseen_days_;
tmp_value[value_size + 2] = value->is_entry_;
memcpy(tmp_value, value->data_.data(), sizeof(float) * value_size);
_db->put(i, (char*)&(iter->first), sizeof(uint64_t), (char*)tmp_value,
db_size * sizeof(float));
count++;
butil::return_object(iter->second);
iter = table.erase(iter);
} else {
++iter;
}
}
}
_db->flush(i);
}
VLOG(1) << "Table>> update count: " << count;
return 0;
}
int64_t SSDSparseTable::SaveValueToText(std::ostream* os,
std::shared_ptr<ValueBlock> block,
std::shared_ptr<::ThreadPool> pool,
const int mode, int shard_id) {
int64_t save_num = 0;
for (auto& table : block->values_) {
for (auto& value : table) {
if (mode == SaveMode::delta && !value.second->need_save_) {
continue;
}
++save_num;
std::stringstream ss;
auto* vs = value.second->data_.data();
auto id = value.first;
ss << id << "\t" << value.second->count_ << "\t"
<< value.second->unseen_days_ << "\t" << value.second->is_entry_
<< "\t";
for (int i = 0; i < block->value_length_ - 1; i++) {
ss << std::to_string(vs[i]) << ",";
}
ss << std::to_string(vs[block->value_length_ - 1]);
ss << "\n";
os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
if (mode == SaveMode::base || mode == SaveMode::delta) {
value.second->need_save_ = false;
}
}
}
if (mode != 1) {
int value_size = block->value_length_;
auto* it = _db->get_iterator(shard_id);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
float* value = (float*)const_cast<char*>(it->value().data());
std::stringstream ss;
ss << *((uint64_t*)const_cast<char*>(it->key().data())) << "\t"
<< value[value_size] << "\t" << value[value_size + 1] << "\t"
<< value[value_size + 2] << "\t";
for (int i = 0; i < block->value_length_ - 1; i++) {
ss << std::to_string(value[i]) << ",";
}
ss << std::to_string(value[block->value_length_ - 1]);
ss << "\n";
os->write(ss.str().c_str(), sizeof(char) * ss.str().size());
}
}
return save_num;
}
int32_t SSDSparseTable::load(const std::string& path,
const std::string& param) {
rwlock_->WRLock();
VLOG(3) << "ssd sparse table load with " << path << " with meta " << param;
LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_,
&shard_values_);
rwlock_->UNLock();
return 0;
}
int64_t SSDSparseTable::LoadFromText(
const std::string& valuepath, const std::string& metapath,
const int pserver_id, const int pserver_num, const int local_shard_num,
std::vector<std::shared_ptr<ValueBlock>>* blocks) {
Meta meta = Meta(metapath);
int num_lines = 0;
std::ifstream file(valuepath);
std::string line;
int value_size = shard_values_[0]->value_length_;
int db_size = 3 + value_size;
float tmp_value[db_size];
while (std::getline(file, line)) {
auto values = paddle::string::split_string<std::string>(line, "\t");
auto id = lexical_cast<uint64_t>(values[0]);
if (id % pserver_num != pserver_id) {
VLOG(3) << "will not load " << values[0] << " from " << valuepath
<< ", please check id distribution";
continue;
}
auto shard_id = id % local_shard_num;
auto block = blocks->at(shard_id);
std::vector<std::vector<float>> kvalues;
ProcessALine(values, meta, id, &kvalues);
block->Init(id, false);
VALUE* value_instant = block->GetValue(id);
if (values.size() == 5) {
value_instant->count_ = lexical_cast<int>(values[1]);
value_instant->unseen_days_ = lexical_cast<int>(values[2]);
value_instant->is_entry_ =
static_cast<bool>(lexical_cast<int>(values[3]));
}
std::vector<float*> block_values = block->Get(id, meta.names, meta.dims);
auto blas = GetBlas<float>();
for (int x = 0; x < meta.names.size(); ++x) {
blas.VCOPY(meta.dims[x], kvalues[x].data(), block_values[x]);
}
VLOG(3) << "loading: " << id
        << " unseen day: " << value_instant->unseen_days_;
if (value_instant->unseen_days_ >= 1) {
tmp_value[value_size] = value_instant->count_;
tmp_value[value_size + 1] = value_instant->unseen_days_;
tmp_value[value_size + 2] = value_instant->is_entry_;
memcpy(tmp_value, value_instant->data_.data(),
sizeof(float) * value_size);
_db->put(shard_id, (char*)&(id), sizeof(uint64_t), (char*)tmp_value,
db_size * sizeof(float));
block->erase(id);
}
}
return 0;
}
}  // namespace distributed
} // namespace paddle
#endif
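
update_table() and LoadFromText() above both persist a feature as value_size embedding floats followed by three bookkeeping floats (count, unseen_days, is_entry). A small sketch of that flat encoding:

    #include <cstring>
    #include <vector>

    // Layout written to RocksDB by the code above:
    // [ data_[0 .. value_size) | count | unseen_days | is_entry ]
    std::vector<float> EncodeForDB(const std::vector<float>& data, float count,
                                   float unseen_days, float is_entry) {
      const int value_size = static_cast<int>(data.size());
      std::vector<float> buf(value_size + 3);
      std::memcpy(buf.data(), data.data(), sizeof(float) * value_size);
      buf[value_size] = count;
      buf[value_size + 1] = unseen_days;
      buf[value_size + 2] = is_entry;
      return buf;
    }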
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/distributed/table/common_sparse_table.h"
#include "paddle/fluid/distributed/table/depends/rocksdb_warpper.h"
#ifdef PADDLE_WITH_HETERPS
namespace paddle {
namespace distributed {
class SSDSparseTable : public CommonSparseTable {
public:
SSDSparseTable() {}
virtual ~SSDSparseTable() {}
virtual int32_t initialize() override;
void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common,
const size_t shard_idx, const int64_t total);
int64_t SaveValueToText(std::ostream* os, std::shared_ptr<ValueBlock> block,
std::shared_ptr<::ThreadPool> pool, const int mode,
int shard_id);
virtual int64_t LoadFromText(
const std::string& valuepath, const std::string& metapath,
const int pserver_id, const int pserver_num, const int local_shard_num,
std::vector<std::shared_ptr<ValueBlock>>* blocks);
virtual int32_t load(const std::string& path, const std::string& param);
// exchange data
virtual int32_t update_table();
virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value);
virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys,
size_t num);
virtual int32_t flush() override { return 0; }
virtual int32_t shrink(const std::string& param) override;
virtual void clear() override {}
private:
RocksDBHandler* _db;
int64_t _cache_tk_size;
};
}  // namespace distributed
} // namespace paddle
#endif
@@ -21,6 +21,9 @@
 #include "paddle/fluid/distributed/table/common_graph_table.h"
 #include "paddle/fluid/distributed/table/common_sparse_table.h"
 #include "paddle/fluid/distributed/table/sparse_geo_table.h"
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/distributed/table/ssd_sparse_table.h"
+#endif
 #include "paddle/fluid/distributed/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/table/tensor_table.h"

@@ -29,6 +32,9 @@ namespace distributed {
 REGISTER_PSCORE_CLASS(Table, GraphTable);
 REGISTER_PSCORE_CLASS(Table, CommonDenseTable);
 REGISTER_PSCORE_CLASS(Table, CommonSparseTable);
+#ifdef PADDLE_WITH_HETERPS
+REGISTER_PSCORE_CLASS(Table, SSDSparseTable);
+#endif
 REGISTER_PSCORE_CLASS(Table, SparseGeoTable);
 REGISTER_PSCORE_CLASS(Table, BarrierTable);
 REGISTER_PSCORE_CLASS(Table, TensorTable);
......
...@@ -16,15 +16,14 @@ limitations under the License. */ ...@@ -16,15 +16,14 @@ limitations under the License. */
#include <cstdint> #include <cstdint>
#include <string> #include <string>
#include "complex128.h" // NOLINT #include "complex.h" // NOLINT
#include "complex64.h" // NOLINT
#include "ext_exception.h" // NOLINT #include "ext_exception.h" // NOLINT
#include "float16.h" // NOLINT #include "float16.h" // NOLINT
namespace paddle { namespace paddle {
using complex64 = paddle::platform::complex64; using complex64 = paddle::platform::complex<float>;
using complex128 = paddle::platform::complex128; using complex128 = paddle::platform::complex<double>;
using float16 = paddle::platform::float16; using float16 = paddle::platform::float16;
enum class DataType { enum class DataType {
......
...@@ -19,8 +19,7 @@ limitations under the License. */ ...@@ -19,8 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/custom_tensor_utils.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
...@@ -238,9 +237,9 @@ template PD_DLL_DECL Tensor ...@@ -238,9 +237,9 @@ template PD_DLL_DECL Tensor
Tensor::copy_to<int16_t>(const PlaceType &target_place) const; Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
template PD_DLL_DECL Tensor template PD_DLL_DECL Tensor
Tensor::copy_to<bool>(const PlaceType &target_place) const; Tensor::copy_to<bool>(const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex64>( template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<float>>(
const PlaceType &target_place) const; const PlaceType &target_place) const;
template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex128>( template PD_DLL_DECL Tensor Tensor::copy_to<paddle::platform::complex<double>>(
const PlaceType &target_place) const; const PlaceType &target_place) const;
template PD_DLL_DECL Tensor template PD_DLL_DECL Tensor
Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const; Tensor::copy_to<paddle::platform::float16>(const PlaceType &target_place) const;
...@@ -253,10 +252,10 @@ template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const; ...@@ -253,10 +252,10 @@ template PD_DLL_DECL uint8_t *Tensor::data<uint8_t>() const;
template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const; template PD_DLL_DECL int8_t *Tensor::data<int8_t>() const;
template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const; template PD_DLL_DECL int16_t *Tensor::data<int16_t>() const;
template PD_DLL_DECL bool *Tensor::data<bool>() const; template PD_DLL_DECL bool *Tensor::data<bool>() const;
template PD_DLL_DECL paddle::platform::complex64 * template PD_DLL_DECL paddle::platform::complex<float>
Tensor::data<paddle::platform::complex64>() const; *Tensor::data<paddle::platform::complex<float>>() const;
template PD_DLL_DECL paddle::platform::complex128 * template PD_DLL_DECL paddle::platform::complex<double>
Tensor::data<paddle::platform::complex128>() const; *Tensor::data<paddle::platform::complex<double>>() const;
template PD_DLL_DECL paddle::platform::float16 * template PD_DLL_DECL paddle::platform::float16 *
Tensor::data<paddle::platform::float16>() const; Tensor::data<paddle::platform::float16>() const;
...@@ -268,10 +267,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>(); ...@@ -268,10 +267,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data<uint8_t>();
template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(); template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>();
template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(); template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>();
template PD_DLL_DECL bool *Tensor::mutable_data<bool>(); template PD_DLL_DECL bool *Tensor::mutable_data<bool>();
template PD_DLL_DECL paddle::platform::complex64 * template PD_DLL_DECL paddle::platform::complex<float>
Tensor::mutable_data<paddle::platform::complex64>(); *Tensor::mutable_data<paddle::platform::complex<float>>();
template PD_DLL_DECL paddle::platform::complex128 * template PD_DLL_DECL paddle::platform::complex<double>
Tensor::mutable_data<paddle::platform::complex128>(); *Tensor::mutable_data<paddle::platform::complex<double>>();
template PD_DLL_DECL paddle::platform::float16 * template PD_DLL_DECL paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>(); Tensor::mutable_data<paddle::platform::float16>();
...@@ -289,10 +288,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>( ...@@ -289,10 +288,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data<int8_t>(
template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>( template PD_DLL_DECL int16_t *Tensor::mutable_data<int16_t>(
const PlaceType &place); const PlaceType &place);
template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data<bool>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::complex64 * template PD_DLL_DECL paddle::platform::complex<float> *
Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place); Tensor::mutable_data<paddle::platform::complex<float>>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::complex128 * template PD_DLL_DECL paddle::platform::complex<double> *
Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place); Tensor::mutable_data<paddle::platform::complex<double>>(const PlaceType &place);
template PD_DLL_DECL paddle::platform::float16 * template PD_DLL_DECL paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place); Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
...@@ -356,13 +355,13 @@ Tensor Tensor::cast(const DataType &target_type) const { ...@@ -356,13 +355,13 @@ Tensor Tensor::cast(const DataType &target_type) const {
dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx)); dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
break; break;
case framework::proto::VarType::COMPLEX64: case framework::proto::VarType::COMPLEX64:
framework::VisitDataType( framework::VisitDataType(dst_type,
dst_type, CastDataType<paddle::platform::complex<float>>(
CastDataType<paddle::platform::complex64>(*tensor, rlt_tensor_, ctx)); *tensor, rlt_tensor_, ctx));
break; break;
case framework::proto::VarType::COMPLEX128: case framework::proto::VarType::COMPLEX128:
framework::VisitDataType(dst_type, framework::VisitDataType(dst_type,
CastDataType<paddle::platform::complex128>( CastDataType<paddle::platform::complex<double>>(
*tensor, rlt_tensor_, ctx)); *tensor, rlt_tensor_, ctx));
break; break;
case framework::proto::VarType::FP16: case framework::proto::VarType::FP16:
......
...@@ -27,7 +27,22 @@ add_subdirectory(fleet) ...@@ -27,7 +27,22 @@ add_subdirectory(fleet)
add_subdirectory(io) add_subdirectory(io)
#ddim lib #ddim lib
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
proto_library(op_def_proto SRCS op_def.proto) proto_library(op_def_proto SRCS op_def.proto)
cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto)
FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
"namespace { \n"
"const std::unordered_map<std::string, std::string> op_def_map = { \n")
foreach(OP_DEF_FILE ${OP_DEF_FILES})
FILE(READ ${OP_DEF_FILE} OP_DEF_CONTENT)
get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE)
FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt
"{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n")
endforeach(OP_DEF_FILE)
FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}")
proto_library(heter_service_proto SRCS heter_service.proto) proto_library(heter_service_proto SRCS heter_service.proto)
proto_library(data_feed_proto SRCS data_feed.proto) proto_library(data_feed_proto SRCS data_feed.proto)
proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto
......
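For clarity, the foreach loop above embeds every operators/compat/*.pbtxt file into a single generated C++ source as raw string literals, keyed by operator name. Assuming a lone compat file relu.pbtxt with hypothetical content, the generated op_def.pbtxt would look roughly like:

namespace {
const std::unordered_map<std::string, std::string> op_def_map = {
{"relu",R"(type: "relu"
def {
  inputs { name: "X" }
  outputs { name: "Out" }
})"},
{"",""}};
}

The trailing {"",""} sentinel comes from the final FILE(APPEND) and closes the initializer list.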
...@@ -109,9 +109,9 @@ void GroupTestCopy() { ...@@ -109,9 +109,9 @@ void GroupTestCopy() {
TestCopyTensor<int8_t>(); TestCopyTensor<int8_t>();
VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<uint8_t>(); TestCopyTensor<uint8_t>();
VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "complex<float> cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::complex64>(); TestCopyTensor<paddle::complex64>();
VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "complex<double> cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::complex128>(); TestCopyTensor<paddle::complex128>();
VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
TestCopyTensor<paddle::float16>(); TestCopyTensor<paddle::float16>();
...@@ -132,9 +132,9 @@ void GroupTestCast() { ...@@ -132,9 +132,9 @@ void GroupTestCast() {
TestCast<uint8_t>(paddle::DataType::FLOAT32); TestCast<uint8_t>(paddle::DataType::FLOAT32);
VLOG(2) << "float cast"; VLOG(2) << "float cast";
TestCast<float>(paddle::DataType::FLOAT32); TestCast<float>(paddle::DataType::FLOAT32);
VLOG(2) << "complex64 cast"; VLOG(2) << "complex<float> cast";
TestCast<paddle::complex64>(paddle::DataType::FLOAT32); TestCast<paddle::complex64>(paddle::DataType::FLOAT32);
VLOG(2) << "complex128 cast"; VLOG(2) << "complex<double> cast";
TestCast<paddle::complex128>(paddle::DataType::FLOAT32); TestCast<paddle::complex128>(paddle::DataType::FLOAT32);
VLOG(2) << "float16 cast"; VLOG(2) << "float16 cast";
TestCast<paddle::float16>(paddle::DataType::FLOAT16); TestCast<paddle::float16>(paddle::DataType::FLOAT16);
......
...@@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, ...@@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place,
platform::errors::Unavailable("Currently, model parallelism is only " platform::errors::Unavailable("Currently, model parallelism is only "
"supported between CPU and CUDA.")); "supported between CPU and CUDA."));
// NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync.
if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) {
TensorCopy(in, dst_place,
*platform::DeviceContextPool::Instance().Get(dst_place), out);
return;
}
// NOTE(yy): TransDataDevice should wait for computation of input. // NOTE(yy): TransDataDevice should wait for computation of input.
if (!platform::is_cuda_pinned_place(in.place())) { if (!platform::is_cuda_pinned_place(in.place())) {
platform::DeviceContextPool::Instance().Get(in.place())->Wait(); platform::DeviceContextPool::Instance().Get(in.place())->Wait();
......
...@@ -638,25 +638,34 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe( ...@@ -638,25 +638,34 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe(
const char* str = reader.get(); const char* str = reader.get();
std::string line = std::string(str); std::string line = std::string(str);
// VLOG(3) << line;
char* endptr = const_cast<char*>(str); char* endptr = const_cast<char*>(str);
int pos = 0; int pos = 0;
for (size_t i = 0; i < use_slots_index_.size(); ++i) { for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i]; int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10); int num = strtol(&str[pos], &endptr, 10);
PADDLE_ENFORCE_NE(
num, 0, if (num <= 0) {
platform::errors::InvalidArgument( std::stringstream ss;
"The number of ids can not be zero, you need padding " ss << "\n\nGot unexpected input, maybe something wrong with it.\n";
"it in data generator; or if there is something wrong with " ss << "\n----------------------\n";
"the data, please check if the data contains unresolvable " ss << "The Origin Input Data:\n";
"characters.\nplease check this error line: %s, \n Specifically, " ss << "----------------------\n";
"something wrong happened(the length of this slot's feasign is 0)"
"when we parse the %d th slots." ss << line << "\n";
"Maybe something wrong around this slot"
"\nWe detect the feasign number of this slot is %d, " ss << "\n----------------------\n";
"which is illegal.", ss << "Some Possible Errors:\n";
str, i, num)); ss << "----------------------\n";
ss << "1. The number of ids can not be zero, you need padding.\n";
ss << "2. The input data contains unresolvable characters.\n";
ss << "3. We detect the slot " << i << "'s feasign number is " << num
<< " which is illegal.\n";
ss << "\n";
PADDLE_THROW(platform::errors::InvalidArgument(ss.str()));
}
if (idx != -1) { if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]); (*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float if ((*instance)[idx].GetType()[0] == 'f') { // float
......
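For context on what the parser expects: in the MultiSlot format each line carries, for every used slot, a feasign count followed by that many values, which is why a count that parses to num <= 0 aborts with the report above. As an illustrative example (slot layout depends on the data generator configuration), a line "2 17 23 1 0.5" encodes slot 0 with two ids (17, 23) and slot 1 with one value (0.5).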
...@@ -19,8 +19,6 @@ limitations under the License. */ ...@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/eigen_ext.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
...@@ -28,8 +26,8 @@ limitations under the License. */ ...@@ -28,8 +26,8 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct bfloat16; struct bfloat16;
struct complex128; template <typename T>
struct complex64; struct complex;
struct float16; struct float16;
template <typename T> template <typename T>
struct complex; struct complex;
...@@ -53,35 +51,31 @@ struct DataTypeTrait<void> { ...@@ -53,35 +51,31 @@ struct DataTypeTrait<void> {
#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \
callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); callback(cpp_type, ::paddle::framework::proto::VarType::proto_type);
#define _ForEachDataType_(callback) \ #define _ForEachDataType_(callback) \
_ForEachDataTypeHelper_(callback, float, FP32); \ _ForEachDataTypeHelper_(callback, float, FP32); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \
_ForEachDataTypeHelper_(callback, double, FP64); \ _ForEachDataTypeHelper_(callback, double, FP64); \
_ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, int, INT32); \
_ForEachDataTypeHelper_(callback, int64_t, INT64); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); \
_ForEachDataTypeHelper_(callback, bool, BOOL); \ _ForEachDataTypeHelper_(callback, bool, BOOL); \
_ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \
_ForEachDataTypeHelper_(callback, int16_t, INT16); \ _ForEachDataTypeHelper_(callback, int16_t, INT16); \
_ForEachDataTypeHelper_(callback, int8_t, INT8); \ _ForEachDataTypeHelper_(callback, int8_t, INT8); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \
COMPLEX64); \ COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
COMPLEX128); \ COMPLEX128);
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); #define _ForEachDataTypeSmall_(callback) \
_ForEachDataTypeHelper_(callback, float, FP32); \
#define _ForEachDataTypeSmall_(callback) \ _ForEachDataTypeHelper_(callback, double, FP64); \
_ForEachDataTypeHelper_(callback, float, FP32); \ _ForEachDataTypeHelper_(callback, int, INT32); \
_ForEachDataTypeHelper_(callback, double, FP64); \ _ForEachDataTypeHelper_(callback, int64_t, INT64); \
_ForEachDataTypeHelper_(callback, int, INT32); \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \
_ForEachDataTypeHelper_(callback, int64_t, INT64); \ COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<float>, \ _ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
COMPLEX64); \ COMPLEX128);
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex<double>, \
COMPLEX128); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \
_ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128);
// For the use of thrust, as index-type elements can be only integers. // For the use of thrust, as index-type elements can be only integers.
#define _ForEachDataTypeTiny_(callback) \ #define _ForEachDataTypeTiny_(callback) \
......
...@@ -119,12 +119,12 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, ...@@ -119,12 +119,12 @@ void TransComplexToReal(const proto::VarType::Type& dst_type,
// complex -> real // complex -> real
switch (src_type) { switch (src_type) {
case proto::VarType::COMPLEX64: case proto::VarType::COMPLEX64:
framework::VisitDataType(dst_type, framework::VisitDataType(
CastDataType<platform::complex64>(in, out, ctx)); dst_type, CastDataType<platform::complex<float>>(in, out, ctx));
break; break;
case proto::VarType::COMPLEX128: case proto::VarType::COMPLEX128:
framework::VisitDataType( framework::VisitDataType(
dst_type, CastDataType<platform::complex128>(in, out, ctx)); dst_type, CastDataType<platform::complex<double>>(in, out, ctx));
break; break;
default: default:
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
......
...@@ -159,10 +159,6 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, ...@@ -159,10 +159,6 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num,
#pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in)
#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ #pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \
omp_in) omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex64 : omp_out += \
omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \
omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex < \ #pragma omp declare reduction(+ : paddle::platform::complex < \
float > : omp_out += omp_in) float > : omp_out += omp_in)
#pragma omp declare reduction(+ : paddle::platform::complex < \ #pragma omp declare reduction(+ : paddle::platform::complex < \
...@@ -222,58 +218,6 @@ void CheckNanInf<paddle::platform::bfloat16>( ...@@ -222,58 +218,6 @@ void CheckNanInf<paddle::platform::bfloat16>(
} }
} }
template <>
void CheckNanInf<paddle::platform::complex64>(
const paddle::platform::complex64* value, const size_t numel, int print_num,
const std::string& op_type, const std::string& var_name) {
float real_sum = 0.0f;
#pragma omp parallel for reduction(+ : real_sum)
for (size_t i = 0; i < numel; ++i) {
real_sum += (value[i].real - value[i].real);
}
float imag_sum = 0.0f;
#pragma omp parallel for reduction(+ : imag_sum)
for (size_t i = 0; i < numel; ++i) {
imag_sum += (value[i].imag - value[i].imag);
}
if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
std::isinf(imag_sum)) {
// hot fix for compile failed in gcc4.8
// here also need print detail info of nan or inf later
PADDLE_THROW(platform::errors::PreconditionNotMet(
"There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name,
op_type));
}
}
template <>
void CheckNanInf<paddle::platform::complex128>(
const paddle::platform::complex128* value, const size_t numel,
int print_num, const std::string& op_type, const std::string& var_name) {
double real_sum = 0.0;
#pragma omp parallel for reduction(+ : real_sum)
for (size_t i = 0; i < numel; ++i) {
real_sum += (value[i].real - value[i].real);
}
double imag_sum = 0.0;
#pragma omp parallel for reduction(+ : imag_sum)
for (size_t i = 0; i < numel; ++i) {
imag_sum += (value[i].imag - value[i].imag);
}
if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) ||
std::isinf(imag_sum)) {
// hot fix for compile failed in gcc4.8
// here also need print detail info of nan or inf later
PADDLE_THROW(platform::errors::PreconditionNotMet(
"There are `nan` or `inf` in tensor (%s) of operator (%s).", var_name,
op_type));
}
}
template <> template <>
void CheckNanInf<paddle::platform::complex<float>>( void CheckNanInf<paddle::platform::complex<float>>(
const paddle::platform::complex<float>* value, const size_t numel, const paddle::platform::complex<float>* value, const size_t numel,
......
...@@ -29,9 +29,7 @@ template <typename T> ...@@ -29,9 +29,7 @@ template <typename T>
static ::DLDataType GetDLDataTypeCode() { static ::DLDataType GetDLDataTypeCode() {
::DLDataType dtype; ::DLDataType dtype;
if (std::is_same<T, platform::complex<float>>::value || if (std::is_same<T, platform::complex<float>>::value ||
std::is_same<T, platform::complex<double>>::value || std::is_same<T, platform::complex<double>>::value) {
std::is_same<T, platform::complex64>::value ||
std::is_same<T, platform::complex128>::value) {
// The current dlpack library version is v0.2, and does not define // The current dlpack library version is v0.2, and does not define
// kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set
// dtype.code to 5U directly here. After the dlpack library version being // dtype.code to 5U directly here. After the dlpack library version being
......
...@@ -29,9 +29,7 @@ namespace { // NOLINT ...@@ -29,9 +29,7 @@ namespace { // NOLINT
template <typename T> template <typename T>
constexpr uint8_t GetDLDataTypeCode() { constexpr uint8_t GetDLDataTypeCode() {
if (std::is_same<T, platform::complex<float>>::value || if (std::is_same<T, platform::complex<float>>::value ||
std::is_same<T, platform::complex<double>>::value || std::is_same<T, platform::complex<double>>::value) {
std::is_same<T, platform::complex64>::value ||
std::is_same<T, platform::complex128>::value) {
return static_cast<uint8_t>(5); return static_cast<uint8_t>(5);
} }
......
...@@ -50,7 +50,7 @@ if (WITH_TESTING) ...@@ -50,7 +50,7 @@ if (WITH_TESTING)
endif(WITH_TESTING) endif(WITH_TESTING)
cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS})
cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector) cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api)
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor)
cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass)
cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass)
...@@ -171,7 +171,7 @@ if (WITH_MKLDNN) ...@@ -171,7 +171,7 @@ if (WITH_MKLDNN)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util)
cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util)
cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util)
set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context) set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function)
if (WITH_GPU OR WITH_ROCM) if (WITH_GPU OR WITH_ROCM)
set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
endif() endif()
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string> #include <string>
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -23,6 +23,65 @@ namespace paddle { ...@@ -23,6 +23,65 @@ namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
FCFusePass::FCFusePass() {
AddOpCompat(OpCompat("mul"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("x_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("y_num_col_dims")
.End();
AddOpCompat(OpCompat("elementwise_add"))
.AddInput("X")
.IsTensor()
.End()
.AddInput("Y")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("axis")
.End();
AddOpCompat(OpCompat("relu"))
.AddInput("X")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End();
AddOpCompat(OpCompat("fc"))
.AddInput("Input")
.IsTensor()
.End()
.AddInput("W")
.IsTensor()
.End()
.AddInput("Bias")
.IsTensor()
.End()
.AddOutput("Out")
.IsTensor()
.End()
.AddAttr("in_num_col_dims")
.IsNumGE(1)
.End()
.AddAttr("activation_type")
.IsStringIn({"relu", ""})
.End();
}
void FCFusePass::ApplyImpl(ir::Graph* graph) const { void FCFusePass::ApplyImpl(ir::Graph* graph) const {
PADDLE_ENFORCE_NOT_NULL( PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
...@@ -52,6 +111,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { ...@@ -52,6 +111,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
LOG(WARNING) << "The subgraph is empty."; LOG(WARNING) << "The subgraph is empty.";
return; return;
} }
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "Pass in op compat failed.";
return;
}
VLOG(4) << "handle FC fuse"; VLOG(4) << "handle FC fuse";
GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
...@@ -159,6 +222,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { ...@@ -159,6 +222,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
} }
desc.Flush(); desc.Flush();
if (!IsCompat(desc)) {
LOG(WARNING) << "Fc fuse pass in out fc op compat failed.";
return;
}
auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied.
if (with_relu) { if (with_relu) {
GraphSafeRemoveNodes( GraphSafeRemoveNodes(
......
...@@ -30,6 +30,7 @@ class Graph; ...@@ -30,6 +30,7 @@ class Graph;
class FCFusePass : public FusePassBase { class FCFusePass : public FusePassBase {
public: public:
FCFusePass();
virtual ~FCFusePass() {} virtual ~FCFusePass() {}
protected: protected:
......
...@@ -15,4 +15,4 @@ cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_ ...@@ -15,4 +15,4 @@ cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_
cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass)
cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op eigen_function)
...@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <memory>
#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"
#include <memory>
#include <mutex>
#include <unordered_map>
#include "paddle/fluid/framework/op_def_api.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace ir { namespace ir {
...@@ -50,18 +53,17 @@ AttrCompat& AttrCompat::IsIntIn(const std::set<int>& candidates) { ...@@ -50,18 +53,17 @@ AttrCompat& AttrCompat::IsIntIn(const std::set<int>& candidates) {
return *this; return *this;
} }
//! Todo: append the definition.
AttrCompat& AttrCompat::IsLeftDefault() { AttrCompat& AttrCompat::IsLeftDefault() {
const std::string& op_name = op_compat_->Name(); const std::string& op_name = op_compat_->Name();
if (!OpInfoMap::Instance().Has(op_name)) { if (!OpInfoMap::Instance().Has(op_name)) {
VLOG(3) << "Op (" << op_name << ") is not registered!"; LOG(WARNING) << "Op (" << op_name << ") is not registered!";
conditions_.emplace_back([](const Attribute& attr) { return false; }); conditions_.emplace_back([](const Attribute& attr) { return false; });
return *this; return *this;
} }
const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); const OpInfo& op_info = OpInfoMap::Instance().Get(op_name);
const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap(); const AttributeMap attrs = op_info.Checker()->GetAttrsDefaultValuesMap();
if (attrs.find(attr_name_) == attrs.end()) { if (attrs.find(attr_name_) == attrs.end()) {
VLOG(3) << "Op (" << op_name << ") has no default attr:" << attr_name_; LOG(WARNING) << "Op (" << op_name << ") has no default attr:" << attr_name_;
conditions_.emplace_back([](const Attribute& attr) { return false; }); conditions_.emplace_back([](const Attribute& attr) { return false; });
} else { } else {
Attribute default_attr = attrs.at(attr_name_); Attribute default_attr = attrs.at(attr_name_);
...@@ -77,6 +79,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) { ...@@ -77,6 +79,10 @@ bool AttrCompat::operator()(const OpDesc& op_desc) {
return true; return true;
} }
if (!op_desc.HasAttr(attr_name_)) { if (!op_desc.HasAttr(attr_name_)) {
if (!optional_) {
LOG(WARNING) << "The non-optional Attr(" << attr_name_ << ") of Op ("
<< op_compat_->Name() << ") was not found!";
}
return optional_; return optional_;
} }
const Attribute attr = op_desc.GetAttr(attr_name_); const Attribute attr = op_desc.GetAttr(attr_name_);
...@@ -149,19 +155,35 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { ...@@ -149,19 +155,35 @@ InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) {
} }
bool OpCompat::Judge(const OpDesc& op_desc) { bool OpCompat::Judge(const OpDesc& op_desc) {
if (is_first_judge_) {
is_first_judge_ = false;
const proto::OpDef& op_def = GetOpDef(op_name_);
if (op_def.has_extra()) {
for (const proto::OpDef_AttrDef& attr : op_def.extra().attrs()) {
extra_attrs_.emplace(attr.name());
}
}
}
for (auto& attr_map : op_desc.GetAttrMap()) { for (auto& attr_map : op_desc.GetAttrMap()) {
if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { if (attr_compats_.find(attr_map.first) == attr_compats_.end()) {
if (extra_attrs_.find(attr_map.first) != extra_attrs_.end()) {
continue;
}
if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) {
VLOG(3) << "The Attr(" << attr_map.first << ") of Op (" << op_name_ LOG(WARNING)
<< ") not reigistered in OpCompat, not equal to default value!"; << "The Attr(" << attr_map.first << ") of Op (" << op_name_
<< ") not reigistered in OpCompat, not in extra attribute, not "
"equal to default value!";
return false; return false;
} }
} }
} }
for (auto& attr_compat : attr_compats_) { for (auto& attr_compat : attr_compats_) {
if (!attr_compat.second(op_desc)) { if (!attr_compat.second(op_desc)) {
VLOG(3) << " Check the Attr(" << attr_compat.first << ") of Op(" LOG(WARNING) << " Check the Attr(" << attr_compat.first << ") of Op("
<< op_name_ << ") failed!"; << op_name_ << ") failed!";
return false; return false;
} }
} }
...@@ -170,8 +192,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -170,8 +192,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& input_desc : inputs_map) { for (auto& input_desc : inputs_map) {
if (input_compats_.find(input_desc.first) == input_compats_.end()) { if (input_compats_.find(input_desc.first) == input_compats_.end()) {
if (!input_desc.second.empty()) { if (!input_desc.second.empty()) {
VLOG(3) << "The Input (" << input_desc.first << ") of Operator (" LOG(WARNING) << "The Input (" << input_desc.first << ") of Operator ("
<< op_name_ << ") not reigistered in OpCompat!"; << op_name_ << ") not reigistered in OpCompat!";
return false; return false;
} }
} }
...@@ -179,14 +201,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -179,14 +201,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& input_val : input_compats_) { for (auto& input_val : input_compats_) {
if (inputs_map.find(input_val.first) == inputs_map.end()) { if (inputs_map.find(input_val.first) == inputs_map.end()) {
if (!input_val.second.Optional()) { if (!input_val.second.Optional()) {
VLOG(3) << "The No optional Input (" << input_val.first LOG(WARNING) << "The No optional Input (" << input_val.first
<< ") of Operator (" << op_name_ << ") not find in op_desc!"; << ") of Operator (" << op_name_
<< ") not find in op_desc!";
return false; return false;
} }
} else { } else {
if (!input_val.second(inputs_map.at(input_val.first))) { if (!input_val.second(inputs_map.at(input_val.first))) {
VLOG(3) << "The Input (" << input_val.first << ") of Operator (" LOG(WARNING) << "The Input (" << input_val.first << ") of Operator ("
<< op_name_ << ") compat check failed!"; << op_name_ << ") compat check failed!";
return false; return false;
} }
} }
...@@ -196,8 +219,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -196,8 +219,8 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& output_desc : outputs_map) { for (auto& output_desc : outputs_map) {
if (output_compats_.find(output_desc.first) == output_compats_.end()) { if (output_compats_.find(output_desc.first) == output_compats_.end()) {
if (!output_desc.second.empty()) { if (!output_desc.second.empty()) {
VLOG(3) << "The Output (" << output_desc.first << ") of Operator (" LOG(WARNING) << "The Output (" << output_desc.first << ") of Operator ("
<< op_name_ << ") not reigistered in OpCompat!"; << op_name_ << ") not reigistered in OpCompat!";
return false; return false;
} }
} }
...@@ -205,14 +228,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) { ...@@ -205,14 +228,15 @@ bool OpCompat::Judge(const OpDesc& op_desc) {
for (auto& output_val : output_compats_) { for (auto& output_val : output_compats_) {
if (outputs_map.find(output_val.first) == outputs_map.end()) { if (outputs_map.find(output_val.first) == outputs_map.end()) {
if (!output_val.second.Optional()) { if (!output_val.second.Optional()) {
VLOG(3) << "The No optional Output (" << output_val.first LOG(WARNING) << "The No optional Output (" << output_val.first
<< ") of Operator (" << op_name_ << ") not find in op_desc!"; << ") of Operator (" << op_name_
<< ") not find in op_desc!";
return false; return false;
} }
} else { } else {
if (!output_val.second(outputs_map.at(output_val.first))) { if (!output_val.second(outputs_map.at(output_val.first))) {
VLOG(3) << "The Output (" << output_val.first << ") of Operator (" LOG(WARNING) << "The Output (" << output_val.first << ") of Operator ("
<< op_name_ << ") compat check failed!"; << op_name_ << ") compat check failed!";
return false; return false;
} }
} }
......
...@@ -140,6 +140,8 @@ class OpCompat { ...@@ -140,6 +140,8 @@ class OpCompat {
std::unordered_map<std::string, AttrCompat> attr_compats_; std::unordered_map<std::string, AttrCompat> attr_compats_;
std::unordered_map<std::string, InputOrOutputCompat> input_compats_; std::unordered_map<std::string, InputOrOutputCompat> input_compats_;
std::unordered_map<std::string, InputOrOutputCompat> output_compats_; std::unordered_map<std::string, InputOrOutputCompat> output_compats_;
std::unordered_set<std::string> extra_attrs_;
bool is_first_judge_ = true;
}; };
/** /**
...@@ -203,6 +205,7 @@ class OpCompatSensiblePass : public Pass { ...@@ -203,6 +205,7 @@ class OpCompatSensiblePass : public Pass {
if (!node_pair.second->IsOp()) continue; if (!node_pair.second->IsOp()) continue;
auto op_type = node_pair.second->Op()->Type(); auto op_type = node_pair.second->Op()->Type();
if (!op_compat_judgers_.count(op_type)) { if (!op_compat_judgers_.count(op_type)) {
LOG(WARNING) << op_type << " compat not registered!";
return false; return false;
} }
auto& judger = *op_compat_judgers_.at(op_type); auto& judger = *op_compat_judgers_.at(op_type);
......
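Putting the pieces together, a pass built on this machinery registers the op signatures it expects in its constructor and gates every rewrite on IsCompat. A minimal sketch under assumed names (MyFusePass is hypothetical; the relu signature mirrors the one FCFusePass registers above):

class MyFusePass : public OpCompatSensiblePass {
 public:
  MyFusePass() {
    AddOpCompat(OpCompat("relu"))
        .AddInput("X")
        .IsTensor()
        .End()
        .AddOutput("Out")
        .IsTensor()
        .End();
  }

 protected:
  void ApplyImpl(ir::Graph* graph) const override {
    // Inside the subgraph handler, skip the rewrite when the matched ops
    // do not satisfy the registered constraints:
    //   if (!IsCompat(subgraph, graph)) return;
  }
};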
...@@ -27,7 +27,6 @@ TEST(OpCompatSensiblePass, compatOp) { ...@@ -27,7 +27,6 @@ TEST(OpCompatSensiblePass, compatOp) {
compat.AddAttr("in_num_col_dims") compat.AddAttr("in_num_col_dims")
.IsIntIn({1, 2}) .IsIntIn({1, 2})
.IsNumLE(1) .IsNumLE(1)
.IsLeftDefault()
.End() .End()
.AddAttr("activation_type") .AddAttr("activation_type")
.IsStringIn({"tanh", "sigmoid"}) .IsStringIn({"tanh", "sigmoid"})
...@@ -68,7 +67,7 @@ TEST(OpCompatSensiblePass, compatOp) { ...@@ -68,7 +67,7 @@ TEST(OpCompatSensiblePass, compatOp) {
fc_op.SetOutput("Out", std::vector<std::string>{"test_output"}); fc_op.SetOutput("Out", std::vector<std::string>{"test_output"});
EXPECT_STREQ(compat.Name().c_str(), "fc"); EXPECT_STREQ(compat.Name().c_str(), "fc");
EXPECT_FALSE(compat.Judge(fc_op)); EXPECT_TRUE(compat.Judge(fc_op));
} }
TEST(OpCompatSensiblePass, compatOpAttribute) { TEST(OpCompatSensiblePass, compatOpAttribute) {
...@@ -92,6 +91,18 @@ TEST(OpCompatSensiblePass, compatOpAttribute) { ...@@ -92,6 +91,18 @@ TEST(OpCompatSensiblePass, compatOpAttribute) {
delete info.checker_; delete info.checker_;
} }
TEST(OpCompatSensiblePass, opDefNotFound) {
OpCompat compat("fc_1");
OpDesc fc_op;
compat.Judge(fc_op);
OpCompat compat_1("");
compat_1.Judge(fc_op);
}
TEST(OpCompatSensiblePass, compatOpAttributeOptional) { TEST(OpCompatSensiblePass, compatOpAttributeOptional) {
OpCompat compat("fc"); OpCompat compat("fc");
compat.AddAttr("activation_type") compat.AddAttr("activation_type")
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
#include "paddle/fluid/framework/op_def_api.h"
#include <fstream>
#include <mutex>
#include <string>
#include <unordered_map>
#ifdef _LINUX
#include <stdio_ext.h>
#include <sys/mman.h>
#include <sys/stat.h>
#endif
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
#include "glog/logging.h"
#include "io/fs.h"
#include "paddle/fluid/framework/op_def.pb.h"
/*
// op_def.pbtxt
namespace {
const std::unordered_map<std::string, std::string> op_def_map = {...};
}
*/
#include "paddle/fluid/framework/op_def.pbtxt" //NOLINT
namespace paddle {
namespace framework {
const proto::OpDef& GetOpDef(const std::string& op_name) {
static std::unordered_map<std::string, proto::OpDef> ops_definition;
static std::mutex mtx;
if (ops_definition.find(op_name) == ops_definition.end()) {
std::lock_guard<std::mutex> lk(mtx);
if (ops_definition.find(op_name) == ops_definition.end()) {
proto::OpDef op_def;
if (op_def_map.find(op_name) == op_def_map.end()) {
LOG(WARNING) << op_name << ".pbtxt not exist!";
} else {
if (!::google::protobuf::TextFormat::ParseFromString(
op_def_map.at(op_name), &op_def)) {
LOG(WARNING) << "Failed to parse " << op_name;
}
}
if (op_def.type() != op_name) {
LOG(WARNING) << op_name << ".pbtxt has error type :" << op_def.type();
ops_definition.emplace(std::make_pair(op_name, proto::OpDef()));
} else {
ops_definition.emplace(std::make_pair(op_name, std::move(op_def)));
}
}
}
return ops_definition.at(op_name);
}
} // namespace framework
} // namespace paddle
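One caveat in GetOpDef above: the first ops_definition.find runs outside the mutex, so two threads resolving different op names for the first time can read the map while a third inserts into it. A fully guarded variant is sketched below (hypothetical, not part of this patch; it reuses the generated op_def_map and drops the type check for brevity):

const proto::OpDef& GetOpDefLocked(const std::string& op_name) {
  static std::unordered_map<std::string, proto::OpDef> ops_definition;
  static std::mutex mtx;
  std::lock_guard<std::mutex> lk(mtx);  // guard lookup and insertion together
  auto it = ops_definition.find(op_name);
  if (it == ops_definition.end()) {
    proto::OpDef op_def;
    auto text_it = op_def_map.find(op_name);
    if (text_it == op_def_map.end()) {
      LOG(WARNING) << op_name << ".pbtxt does not exist!";
    } else if (!::google::protobuf::TextFormat::ParseFromString(
                   text_it->second, &op_def)) {
      LOG(WARNING) << "Failed to parse " << op_name;
    }
    it = ops_definition.emplace(op_name, std::move(op_def)).first;
  }
  return it->second;
}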
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -12,11 +12,12 @@ ...@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/increment_op.h" #pragma once
namespace ops = paddle::operators; #include "paddle/fluid/framework/op_def.pb.h"
REGISTER_OP_CUDA_KERNEL(
increment, ops::IncrementKernel<paddle::platform::CUDADeviceContext, float>, namespace paddle {
ops::IncrementKernel<paddle::platform::CUDADeviceContext, double>, namespace framework {
ops::IncrementKernel<paddle::platform::CUDADeviceContext, int>, const proto::OpDef& GetOpDef(const std::string& op_name);
ops::IncrementKernel<paddle::platform::CUDADeviceContext, int64_t>); }
}
...@@ -317,8 +317,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -317,8 +317,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__) __VA_ARGS__)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#else
#define REGISTER_OP_CUDA_KERNEL(op_type, ...)
#endif
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
......
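With the guard above, a CUDA kernel registration compiles away entirely on builds without PADDLE_WITH_CUDA or PADDLE_WITH_HIP, so it no longer needs to live in a .cu-only translation unit; this is presumably what allows increment_op.cu above to be deleted. An illustrative use (my_op and MyOpKernel are placeholders):

REGISTER_OP_CUDA_KERNEL(
    my_op, ops::MyOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::MyOpKernel<paddle::platform::CUDADeviceContext, double>);
// Expands to an empty statement on CPU-only builds.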
...@@ -1532,7 +1532,12 @@ Scope* OperatorWithKernel::PrepareData( ...@@ -1532,7 +1532,12 @@ Scope* OperatorWithKernel::PrepareData(
// the rest iterations to save the elapsed time. // the rest iterations to save the elapsed time.
// We do not support skipping PrepareData in while block, because the Op's // We do not support skipping PrepareData in while block, because the Op's
// input may be changed by subsequent Ops, which may cause an error. // input may be changed by subsequent Ops, which may cause an error.
if (pre_scope_ == &scope && new_scope == nullptr) {
// For inference, ops behind a conditional branch aren't supported well, so
// the prepare-data optimization is disabled conservatively.
bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
Attr<bool>("inference_force_prepare_data");
if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
need_prepare_data_ = false; need_prepare_data_ = false;
} }
......
...@@ -22,8 +22,7 @@ limitations under the License. */ ...@@ -22,8 +22,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
...@@ -1137,9 +1136,9 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { ...@@ -1137,9 +1136,9 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
} }
template <> template <>
std::ostream& print_tensor<paddle::platform::complex64>( std::ostream& print_tensor<paddle::platform::complex<float>>(
std::ostream& os, const framework::Tensor& tensor) { std::ostream& os, const framework::Tensor& tensor) {
auto inspect = tensor.data<paddle::platform::complex64>(); auto inspect = tensor.data<paddle::platform::complex<float>>();
auto element_num = tensor.numel(); auto element_num = tensor.numel();
os << " - data: ["; os << " - data: [";
...@@ -1155,9 +1154,9 @@ std::ostream& print_tensor<paddle::platform::complex64>( ...@@ -1155,9 +1154,9 @@ std::ostream& print_tensor<paddle::platform::complex64>(
} }
template <> template <>
std::ostream& print_tensor<paddle::platform::complex128>( std::ostream& print_tensor<paddle::platform::complex<double>>(
std::ostream& os, const framework::Tensor& tensor) { std::ostream& os, const framework::Tensor& tensor) {
auto inspect = tensor.data<paddle::platform::complex128>(); auto inspect = tensor.data<paddle::platform::complex<double>>();
auto element_num = tensor.numel(); auto element_num = tensor.numel();
os << " - data: ["; os << " - data: [";
......
...@@ -24,8 +24,7 @@ ...@@ -24,8 +24,7 @@
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -200,8 +199,8 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -200,8 +199,8 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
PADDLE_TENSOR_ADD(double); PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only support complex grad tensor accumulated, // NOTE(chenweihang): only support complex grad tensor accumulated,
// support selected rows if needed in the future // support selected rows if needed in the future
PADDLE_TENSOR_ADD(platform::complex64); PADDLE_TENSOR_ADD(platform::complex<float>);
PADDLE_TENSOR_ADD(platform::complex128); PADDLE_TENSOR_ADD(platform::complex<double>);
#endif #endif
#undef PADDLE_TENSOR_ADD #undef PADDLE_TENSOR_ADD
......
...@@ -270,7 +270,46 @@ bool AnalysisPredictor::CreateExecutor() { ...@@ -270,7 +270,46 @@ bool AnalysisPredictor::CreateExecutor() {
executor_.reset(new paddle::framework::NaiveExecutor(place_)); executor_.reset(new paddle::framework::NaiveExecutor(place_));
return true; return true;
} }
static bool IsPrepareDataOptTargetOp(framework::OpDesc *op) {
// Bad case for the prepare-data optimization: suppose an op sits behind a
// conditional_block. If branch 1 is chosen, the op needs to call prepare
// data; if branch 2 is chosen, it does not. Once the predictor takes
// branch 2 at runtime, the optimization kicks in, and a later run that
// takes branch 1 fails because the op has lost its chance to prepare data.
std::vector<std::string> op_type = {"conditional_block_infer",
"select_input"};
for (const auto &type : op_type) {
if (op->Type() == type) {
return true;
}
}
return false;
}
static void DisablePrepareDataOpt(
std::shared_ptr<framework::ProgramDesc> inference_program, int block,
bool pre_disable_opt) {
bool disable_opt = false;
auto &infer_block = inference_program->Block(block);
for (auto *op : infer_block.AllOps()) {
if (disable_opt || pre_disable_opt) {
op->SetAttr("inference_force_prepare_data", true);
}
if (op->HasAttr("sub_block")) {
int blockID = op->GetBlockAttrId("sub_block");
DisablePrepareDataOpt(inference_program, blockID,
disable_opt || pre_disable_opt);
}
// disable prepare data if unfriendly op is found
disable_opt = IsPrepareDataOptTargetOp(op);
}
}
bool AnalysisPredictor::PrepareExecutor() { bool AnalysisPredictor::PrepareExecutor() {
DisablePrepareDataOpt(inference_program_, 0, false);
executor_->Prepare(sub_scope_, *inference_program_, 0, executor_->Prepare(sub_scope_, *inference_program_, 0,
config_.use_feed_fetch_ops_); config_.use_feed_fetch_ops_);
...@@ -1197,6 +1236,9 @@ USE_TRT_CONVERTER(roi_align); ...@@ -1197,6 +1236,9 @@ USE_TRT_CONVERTER(roi_align);
USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(affine_channel);
USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(multiclass_nms);
USE_TRT_CONVERTER(nearest_interp); USE_TRT_CONVERTER(nearest_interp);
USE_TRT_CONVERTER(reshape);
USE_TRT_CONVERTER(reduce_sum);
USE_TRT_CONVERTER(gather_nd);
#endif #endif
namespace paddle_infer { namespace paddle_infer {
......
...@@ -12,6 +12,9 @@ nv_library(tensorrt_converter ...@@ -12,6 +12,9 @@ nv_library(tensorrt_converter
affine_channel_op.cc affine_channel_op.cc
multiclass_nms_op.cc multiclass_nms_op.cc
nearest_interp_op.cc nearest_interp_op.cc
reshape_op.cc
reduce_op.cc
gather_nd_op.cc
DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS nv_test(test_op_converter SRCS test_op_converter.cc DEPS
......
...@@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, ...@@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data), bias_size}; static_cast<void*>(bias_data), bias_size};
auto* layer = fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output, n_input, // In conv2d_transpose and depthwise_conv2d_transpose,
nv_ksize, weight, bias); // output channels = filter_dims[1] * groups
PADDLE_ENFORCE_NOT_NULL(layer, auto* layer = (op_desc.Type() == "conv2d_transpose" ||
platform::errors::Fatal("TensorRT create conv2d" op_desc.Type() == "depthwise_conv2d_transpose")
" layer error.")); ? fadd_layer(const_cast<nvinfer1::ITensor*>(X),
n_input * groups, nv_ksize, weight, bias)
: fadd_layer(const_cast<nvinfer1::ITensor*>(X), n_output,
nv_ksize, weight, bias);
PADDLE_ENFORCE_NOT_NULL(
layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose"
" layer failed."));
layer->setStride(nv_strides); layer->setStride(nv_strides);
layer->setPadding(nv_paddings); layer->setPadding(nv_paddings);
layer->setNbGroups(groups); layer->setNbGroups(groups);
...@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter { ...@@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter {
ConvertConv2d( ConvertConv2d(
engine_, op, scope, test_mode, engine_, op, scope, test_mode,
[&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */ [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */
int n_input, /* Conv input maps */
nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* { TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* {
auto* layer = auto* layer =
...@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter { ...@@ -156,7 +162,6 @@ class Deconv2dOpConverter : public OpConverter {
ConvertConv2d( ConvertConv2d(
engine_, op, scope, test_mode, engine_, op, scope, test_mode,
[&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */ [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */
int n_input, /* Deconv output maps */
nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight,
TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* {
auto* layer = auto* layer =
......
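To make the channel arithmetic above concrete: a transposed-convolution weight is laid out as [C_in, C_out/groups, kH, kW], so the true number of output maps is filter_dims[1] * groups (the n_input * groups passed to fadd_layer), not the n_output a forward conv would infer. A small worked check under assumed shapes:

// Assumed deconv weight shape [C_in, C_out/groups, kH, kW] = [64, 32, 3, 3]
// with groups = 2: the IDeconvolutionLayer must be created with
int filter_dims1 = 32, groups = 2;
int out_maps = filter_dims1 * groups;  // 32 * 2 = 64 output maps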
...@@ -40,10 +40,19 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { ...@@ -40,10 +40,19 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
auto word_emb_name = op_desc.Input("WordEmbedding").front(); auto word_emb_name = op_desc.Input("WordEmbedding").front();
auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto pos_emb_name = op_desc.Input("PosEmbedding").front();
auto sent_emb_name = op_desc.Input("SentEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front();
std::vector<std::string> id_names = {word_id_name, pos_id_name,
sent_id_name}; std::vector<std::string> id_names;
std::vector<std::string> emb_names = {word_emb_name, pos_emb_name, std::vector<std::string> emb_names;
sent_emb_name};
if (engine_->use_oss()) {
id_names =
std::vector<std::string>{word_id_name, pos_id_name, sent_id_name};
emb_names =
std::vector<std::string>{word_emb_name, pos_emb_name, sent_emb_name};
} else {
id_names = op_desc.Input("Ids");
emb_names = op_desc.Input("Embs");
}
int input_num = id_names.size(); int input_num = id_names.size();
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
class GatherNdOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a paddle gather_nd op to tensorrt gather_nd plugin";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> inputs;
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
auto* index = engine_->GetITensor(op_desc.Input("Index")[0]);
inputs.emplace_back(input);
inputs.emplace_back(index);
nvinfer1::ILayer* layer = nullptr;
bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
plugin::GatherNdPluginDynamic* plugin =
new plugin::GatherNdPluginDynamic(with_fp16);
layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin);
std::string layer_name = "gather_nd (Output: ";
auto output_name = op_desc.Output("Out")[0];
layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0));
layer_name += output_name;
if (test_mode) {
engine_->DeclareOutput(output_name);
}
layer->setName((layer_name + ")").c_str());
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter);
...@@ -127,6 +127,13 @@ class OpConverter { ...@@ -127,6 +127,13 @@ class OpConverter {
it, platform::errors::Unimplemented("no OpConverter for optype [%s]", it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type())); op_desc.Type()));
} }
// reshape2 == reshape
if (op_desc.Type() == "reshape2") {
it = Registry<OpConverter>::Global().Lookup("reshape");
PADDLE_ENFORCE_NOT_NULL(
it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
op_desc.Type()));
}
if (!it) { if (!it) {
it = Registry<OpConverter>::Global().Lookup(op_desc.Type()); it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
} }
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <NvInfer.h>
#include <sys/types.h>
#include <cstddef>
#include <cstdint>
#include <vector>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace framework {
class Scope;
namespace proto {
class OpDesc;
} // namespace proto
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
namespace tensorrt {
class ReduceSumOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer";
framework::OpDesc op_desc(op, nullptr);
auto* x = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims input_shape = x->getDimensions();
int input_dims = input_shape.nbDims;
bool keep_dim = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_dim"));
std::vector<int32_t> dim =
BOOST_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("dim"));
bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all"));
// Now we only support dynamic_shape mode.
nvinfer1::IReduceLayer* layer = nullptr;
if (reduce_all) {
uint32_t reduce_dim = 0;
for (int i = 0; i < input_dims; ++i) {
reduce_dim |= 1 << i;
}
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
nvinfer1::ReduceOperation::kSUM, reduce_dim,
keep_dim);
} else {
auto CvtToBitMask = [&](const std::vector<int32_t>& dims) -> uint32_t {
uint32_t res = 0;
for (auto x : dims) {
if (x < 0) {
res |= 1 << (x + input_dims);
} else {
res |= 1 << x;
}
}
return res;
};
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
nvinfer1::ReduceOperation::kSUM,
CvtToBitMask(dim), keep_dim);
}
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter);
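A standalone illustration (not part of the patch) of the CvtToBitMask lambda above; the axis values and dim count are illustrative:

#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors the converter's CvtToBitMask: each reduced axis sets one bit of the
// TRT reduce mask; negative axes are wrapped by input_dims first.
uint32_t CvtToBitMask(const std::vector<int32_t>& dims, int input_dims) {
  uint32_t res = 0;
  for (auto x : dims) {
    res |= 1u << (x < 0 ? x + input_dims : x);
  }
  return res;
}

int main() {
  const int input_dims = 3;
  assert(CvtToBitMask({0, 2}, input_dims) == 5u);  // bits 0 and 2 -> 0b101
  assert(CvtToBitMask({-1}, input_dims) == 4u);    // -1 wraps to axis 2 -> 0b100
  return 0;
}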
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace framework {
class Scope;
namespace proto {
class OpDesc;
} // namespace proto
} // namespace framework
} // namespace paddle
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* ReshapeOp
*/
class ReshapeOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
std::vector<int> shape =
BOOST_GET_CONST(std::vector<int>, op_desc.GetAttr("shape"));
int nbDims_num = shape.size();
nvinfer1::Dims reshape_dim;
if (engine_->with_dynamic_shape()) { // running the TRT Dynamic Shape mode
reshape_dim.nbDims = nbDims_num;
for (int i = 0; i < nbDims_num; ++i) {
reshape_dim.d[i] = shape[i];
}
} else { // running the TRT Static Shape mode
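      // TRT static-shape engines use an implicit batch dimension, so the
      // batch entry shape[0] is dropped and only the remaining dims are set.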
reshape_dim.nbDims = nbDims_num - 1;
for (int i = 0; i < nbDims_num - 1; ++i) {
reshape_dim.d[i] = shape[i + 1];
}
}
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
layer->setReshapeDimensions(reshape_dim);
auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "reshape", {output_name}, test_mode);
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(reshape, ReshapeOpConverter);
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/data_layout.h"
...@@ -49,6 +50,10 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -49,6 +50,10 @@ struct SimpleOpTypeSetTeller : public Teller {
#endif #endif
#if IS_TRT_VERSION_GE(7130) #if IS_TRT_VERSION_GE(7130)
teller_set.insert("group_norm"); teller_set.insert("group_norm");
#endif
#if CUDA_VERSION >= 10200
teller_set.insert("reshape");
teller_set.insert("reshape2");
#endif #endif
} }
...@@ -118,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -118,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller {
"flatten2", "flatten2",
"flatten", "flatten",
"gather", "gather",
"gather_nd",
"yolo_box", "yolo_box",
"roi_align", "roi_align",
"affine_channel", "affine_channel",
"nearest_interp", "nearest_interp",
"anchor_generator", "anchor_generator",
"reduce_sum",
}; };
}; };
...@@ -143,19 +150,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -143,19 +150,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("paddings")); BOOST_GET_CONST(std::vector<int>, desc.GetAttr("paddings"));
if (paddings.size() > 2) return false; if (paddings.size() > 2) return false;
// strides > 1 is only supported by trt7.0 above
#if !IS_TRT_VERSION_GE(7000)
if (desc.HasAttr("strides")) {
const std::vector<int> strides =
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides"));
// there is no issue if strides.size() less than 2
if (strides.size() > 1) {
for (size_t i = 0; i < strides.size(); i++) {
if (strides[i] > 1) return false;
}
}
}
#endif
} }
if (op_type == "pool2d") { if (op_type == "pool2d") {
...@@ -239,15 +233,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -239,15 +233,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
return false; return false;
} }
// strides > 1 is only supported by trt7.0 above // strides > 1 and 'SAME' is only supported by trt7.0 above
#if !IS_TRT_VERSION_GE(7000) #if !IS_TRT_VERSION_GE(7000)
if (desc.HasAttr("strides")) { if (op_type == "conv2d" || op_type == "conv2d_fusion" ||
const std::vector<int> strides = op_type == "depthwise_conv2d") {
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides")); if (desc.HasAttr("padding_algorithm") && with_dynamic_shape) {
// there is no issue if strides.size() less than 2 auto padding_algorithm =
if (strides.size() > 1) { BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm"));
for (size_t i = 0; i < strides.size(); i++) { if (padding_algorithm == "SAME" && desc.HasAttr("strides")) {
if (strides[i] > 1) return false; const std::vector<int> strides =
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("strides"));
// there is no issue if strides.size() less than 2
if (strides.size() > 1) {
for (size_t i = 0; i < strides.size(); i++) {
if (strides[i] > 1) return false;
}
}
} }
} }
} }
...@@ -326,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -326,6 +327,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
} }
if (op_type == "gather_nd") {
auto* block = desc.Block();
auto x_var_name = desc.Input("X")[0];
auto index_var_name = desc.Input("Index")[0];
auto* x_var_desc = block->FindVar(x_var_name);
auto* index_var_desc = block->FindVar(index_var_name);
// The index input must be int32 datatype.
if (index_var_desc->GetDataType() !=
paddle::framework::proto::VarType_Type::VarType_Type_INT32) {
VLOG(3) << "gather_nd op Index input data type must be int32";
return false;
}
const auto index_shape = index_var_desc->GetShape();
const auto x_shape = x_var_desc->GetShape();
if (x_shape.size() != index_shape.size()) {
VLOG(3) << "gather_nd op Index input dims size [" << index_shape.size()
<< " ] not equal to x dims size [" << x_shape.size() << "]";
return false;
}
if (!with_dynamic_shape) return false;
}
if (op_type == "yolo_box") { if (op_type == "yolo_box") {
if (with_dynamic_shape) return false; if (with_dynamic_shape) return false;
bool has_attrs = bool has_attrs =
...@@ -673,6 +698,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ...@@ -673,6 +698,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
return false; return false;
} }
} }
if (op_type == "reshape" || op_type == "reshape2") {
if (!desc.HasAttr("shape")) {
return false;
// Paddle-TRT does not support the input tensors: Shape and ShapeTensor
} else if (desc.Input("Shape").size() >= 1 ||
desc.Input("ShapeTensor").size() >= 1) {
return false;
} else {
std::vector<int> shape =
BOOST_GET_CONST(std::vector<int>, desc.GetAttr("shape"));
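// nvinfer1::Dims::MAX_DIMS is 8, so longer shape attributes cannot be
// represented by a TRT Dims and the op is rejected.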
if (shape.size() >= nvinfer1::Dims::MAX_DIMS) return false;
}
}
if (op_type == "reduce_sum") {
if (!with_dynamic_shape) {
VLOG(3) << "the reduce_sum does not support static shape yet";
return false;
}
if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") &&
desc.HasAttr("reduce_all"))) {
VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or "
"reduce_all)";
return false;
}
}
if ((*teller)(op_type, desc, use_no_calib_int8)) return true; if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
} }
......
...@@ -8,6 +8,7 @@ nv_library(tensorrt_plugin ...@@ -8,6 +8,7 @@ nv_library(tensorrt_plugin
anchor_generator_op_plugin.cu anchor_generator_op_plugin.cu
yolo_box_op_plugin.cu yolo_box_op_plugin.cu
roi_align_op_plugin.cu roi_align_op_plugin.cu
gather_nd_op_plugin.cu
DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor)
nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cuda_fp16.h>
#include <algorithm>
#include <cstdint>
#include <functional>
#include <numeric>
#include <sstream>
#include "NvInferRuntimeCommon.h"
#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
template <typename T, typename IndexT = int>
__global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims,
const IndexT* indices, T* output,
int32_t remain_size, int32_t slice_size,
int32_t end_size) {
CUDA_KERNEL_LOOP(i, remain_size * slice_size) {
int indices_i = i / slice_size;
int slice_i = i - indices_i * slice_size; // offset inside the slice
IndexT gather_i = 0;
int32_t temp = slice_size;
for (int32_t j = end_size - 1; j >= 0; --j) {
auto index_value = indices[indices_i * end_size + j];
PADDLE_ENFORCE(
index_value >= 0 && index_value < input_dims[j],
"The index is out of bounds, "
"please check whether the dimensions of index and "
"input meet the requirements. It should "
"be less than [%d] and greater or equal to 0, but received [%d]",
input_dims[j], index_value);
gather_i += (index_value * temp);
temp *= input_dims[j];
}
IndexT input_i = gather_i + slice_i;
*(output + i) = *(input + input_i);
}
}
int GatherNdPluginDynamic::initialize() { return 0; }
size_t GatherNdPluginDynamic::getSerializationSize() const {
return SerializedSize(with_fp16_);
}
void GatherNdPluginDynamic::serialize(void* buffer) const {
SerializeValue(&buffer, with_fp16_);
}
nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions(
int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
nvinfer1::IExprBuilder& expr_builder) {
PADDLE_ENFORCE_EQ(
nb_inputs, 2,
platform::errors::InvalidArgument(
"The gather_nd plugin should have 2 input, but got %d.", nb_inputs));
PADDLE_ENFORCE_EQ(output_index, 0,
platform::errors::InvalidArgument(
"When GetOutputDimensions in gather_nd "
"plugin, the output_index should be 0."));
nvinfer1::DimsExprs x_dims = inputs[0];
nvinfer1::DimsExprs index_dims = inputs[1];
int32_t x_dims_size = x_dims.nbDims;
int32_t index_dims_size = index_dims.nbDims;
// TODO(wilber): The result dims should be Index.shape[:-1] +
// X.shape[Index.shape[-1]:], but the TRT DimsExprs is an expression, so we
// can't get the actual value. Thus we only support one scenario:
// input_dims.size == index_dims.size.
nvinfer1::DimsExprs ret(x_dims);
for (int i = 0; i < index_dims_size - 1; ++i) {
ret.d[i] = index_dims.d[i];
}
return ret;
}
bool GatherNdPluginDynamic::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs,
int nb_outputs) {
PADDLE_ENFORCE_NOT_NULL(
in_out, platform::errors::InvalidArgument(
"The input of gather_nd plugin should not be nullptr."));
PADDLE_ENFORCE_LT(
pos, nb_inputs + nb_outputs,
platform::errors::InvalidArgument("The pos(%d) should be less than the "
"num(%d) of the input and the output.",
pos, nb_inputs + nb_outputs));
const nvinfer1::PluginTensorDesc& in = in_out[pos];
if (pos == 0) {
if (with_fp16_) {
return (in.type == nvinfer1::DataType::kFLOAT ||
in.type == nvinfer1::DataType::kHALF) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
} else {
return (in.type == nvinfer1::DataType::kFLOAT) &&
(in.format == nvinfer1::TensorFormat::kLINEAR);
}
} else if (pos == 1) {
return in.type == nvinfer1::DataType::kINT32 &&
in.format == nvinfer1::TensorFormat::kLINEAR;
} else if (pos == 2) {
return in.type == in_out[0].type &&
in.format == nvinfer1::TensorFormat::kLINEAR;
}
return true;
}
nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType(
int index, const nvinfer1::DataType* input_types, int nb_inputs) const {
return input_types[0];
}
int GatherNdPluginDynamic::enqueue(
const nvinfer1::PluginTensorDesc* input_desc,
const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
void* const* outputs, void* workspace, cudaStream_t stream) {
auto input_dims = input_desc[0].dims;
auto index_dims = input_desc[1].dims;
auto input_dims_size = input_dims.nbDims;
auto index_dims_size = index_dims.nbDims;
std::vector<int32_t> input_shape, index_shape, out_shape;
for (int i = 0; i < input_dims.nbDims; i++)
input_shape.push_back(input_dims.d[i]);
for (int i = 0; i < index_dims.nbDims; i++)
index_shape.push_back(index_dims.d[i]);
// The out_shape is
// Index.shape[:-1] + X.shape[Index.shape[-1]:]
for (int i = 0; i < index_dims_size - 1; ++i) {
out_shape.emplace_back(index_shape[i]);
}
for (int i = index_shape[index_dims_size - 1]; i < input_dims_size; ++i) {
out_shape.emplace_back(input_shape[i]);
}
// final dim
int end_size = index_shape[index_dims_size - 1];
// remain dim
std::vector<int> remain_ddim(index_shape.begin(), index_shape.end() - 1);
int remain_numel = std::accumulate(remain_ddim.begin(), remain_ddim.end(), 1,
std::multiplies<int>());
// slice size
int slice_size = 1;
for (int i = end_size; i < input_dims_size; ++i) {
slice_size *= input_shape[i];
}
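  // Worked example (illustrative): input_shape = [2, 3, 4] with
  // index_shape = [2, 3, 2] gives end_size = 2, remain_numel = 2 * 3 = 6 and
  // slice_size = 4; out_shape = [2, 3, 4], i.e. each of the 6 index pairs
  // selects one length-4 slice of the input.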
auto input_type = input_desc[0].type;
if (input_type == nvinfer1::DataType::kFLOAT) {
VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp32";
const float* p_input = static_cast<const float*>(inputs[0]);
const int32_t* p_index = static_cast<const int32_t*>(inputs[1]);
float* p_output = static_cast<float*>(outputs[0]);
if (input_dims_data_ == nullptr) {
cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int));
}
cudaMemcpyAsync(input_dims_data_, input_shape.data(),
sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice,
stream);
int block = 512;
int n = slice_size * remain_numel;
int grid = (n + block - 1) / block;
GatherNdCUDAKernel<float, int32_t><<<grid, block, 0, stream>>>(
p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size,
end_size);
} else if (input_type == nvinfer1::DataType::kHALF) {
VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp16";
const half* p_input = static_cast<const half*>(inputs[0]);
const int32_t* p_index = static_cast<const int32_t*>(inputs[1]);
half* p_output = static_cast<half*>(outputs[0]);
if (input_dims_data_ == nullptr) {
cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int));
}
cudaMemcpyAsync(input_dims_data_, input_shape.data(),
sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice,
stream);
int block = 512;
int n = slice_size * remain_numel;
int grid = (n + block - 1) / block;
GatherNdCUDAKernel<half, int32_t><<<grid, block, 0, stream>>>(
p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size,
end_size);
}
return cudaGetLastError() != cudaSuccess;
}
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <thrust/device_vector.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class GatherNdPluginDynamic : public DynamicPluginTensorRT {
public:
explicit GatherNdPluginDynamic(bool with_fp16) { with_fp16_ = with_fp16; }
GatherNdPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &with_fp16_);
}
nvinfer1::IPluginV2DynamicExt* clone() const override {
return new GatherNdPluginDynamic(with_fp16_);
}
const char* getPluginType() const override { return "gather_nd_plugin"; }
int getNbOutputs() const override { return 1; }
int initialize() override;
size_t getSerializationSize() const override;
void serialize(void* buffer) const override;
nvinfer1::DimsExprs getOutputDimensions(
int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
nvinfer1::IExprBuilder& exprBuilder) override;
bool supportsFormatCombination(int pos,
const nvinfer1::PluginTensorDesc* inOut,
int nbInputs, int nbOutputs) override;
void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
int nbInputs,
const nvinfer1::DynamicPluginTensorDesc* out,
int nbOutputs) override {}
size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
int nbInputs,
const nvinfer1::PluginTensorDesc* outputs,
int nbOutputs) const override {
return 0;
}
int enqueue(const nvinfer1::PluginTensorDesc* inputDesc,
const nvinfer1::PluginTensorDesc* outputDesc,
const void* const* inputs, void* const* outputs, void* workspace,
cudaStream_t stream) override;
nvinfer1::DataType getOutputDataType(int index,
const nvinfer1::DataType* inputTypes,
int nbInputs) const override;
void destroy() override {
if (input_dims_data_) {
cudaFree(input_dims_data_);
}
delete this;
}
private:
int32_t* input_dims_data_{nullptr};
};
class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator {
public:
GatherNdPluginDynamicCreator() {}
const char* getPluginName() const override { return "gather_nd_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new GatherNdPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(GatherNdPluginDynamicCreator);
#endif
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
...@@ -23,7 +23,30 @@ function(inference_download INSTALL_DIR URL FILENAME) ...@@ -23,7 +23,30 @@ function(inference_download INSTALL_DIR URL FILENAME)
) )
endfunction() endfunction()
function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME})
string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME})
set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}")
set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
URL_HASH MD5=${CHECK_SUM}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_EXTRACT 1
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR}
${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME}
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction()
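A hypothetical call of the verified helper (the install dir and the MD5 digest below are placeholders, not values from this patch):

# Hypothetical usage; the digest is a placeholder, not a real checksum.
inference_download_and_uncompress(${SOME_INSTALL_DIR} ${INFERENCE_URL}
    "some_model.tar.gz" "0123456789abcdef0123456789abcdef")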
function(inference_download_and_uncompress_without_verify INSTALL_DIR URL FILENAME)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME})
string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME})
...@@ -47,13 +70,13 @@ endfunction() ...@@ -47,13 +70,13 @@ endfunction()
set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec")
if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz)
inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") inference_download_and_uncompress_without_verify(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz")
endif() endif()
set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model")
set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet")
if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz) if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz)
inference_download_and_uncompress(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") inference_download_and_uncompress_without_verify(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz")
endif() endif()
set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model")
......
...@@ -115,9 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun ...@@ -115,9 +115,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function)
if (WITH_GPU OR WITH_ROCM) if (WITH_GPU OR WITH_ROCM)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
endif() endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer)
...@@ -171,7 +171,7 @@ endif() ...@@ -171,7 +171,7 @@ endif()
if (WITH_ASCEND_CL) if (WITH_ASCEND_CL)
cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor)
cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_function scope device_context enforce executor compare_op)
endif() endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
#include "paddle/fluid/operators/abs_op.h" #include "paddle/fluid/operators/abs_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
......
...@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { ...@@ -789,6 +789,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel {
} }
}; };
template <typename T>
class SigmoidDoubleGradMaker
: public ::paddle::framework::SingleGradOpMaker<T> {
public:
using ::paddle::framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("sigmoid_grad_grad");
// input1: Out
op->SetInput("Out", this->Input("Out"));
// input2: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetInput("DOut", this->Input(framework::GradVarName("Out")));
op->SetAttrMap(this->Attrs());
// output: ddy
op->SetOutput("DOutNew", this->InputGrad("Out"));
op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out")));
}
};
template <typename T> template <typename T>
class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
public: public:
...@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform; ...@@ -1068,6 +1089,47 @@ namespace plat = paddle::platform;
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
/* ========================== sigmoid register ============================= */
// 1. Register Sigmoid Operator
REGISTER_OPERATOR(
sigmoid, ops::ActivationOp, ops::SigmoidOpMaker,
ops::ActivationOpInferVarType,
ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
paddle::framework::OpDesc>,
ops::ActivationGradOpMaker<ops::SigmoidGradFunctor<float>::FwdDeps(),
paddle::imperative::OpBase>,
std::conditional<ops::CanInplaceAct<ops::SigmoidGradFunctor<float>>(),
ops::ActFwdInplaceInferer, void>::type);
// 2. Register Sigmoid Grad Operator
REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad,
ops::ActivationGradOpInplaceInferer,
ops::SigmoidDoubleGradMaker<paddle::framework::OpDesc>,
ops::SigmoidDoubleGradMaker<paddle::imperative::OpBase>);
// 3. Register Sigmoid DoubleGrad Operator
REGISTER_OPERATOR(
sigmoid_grad_grad,
ops::ActivationOpDoubleGrad<ops::SigmoidGradFunctor<float>::FwdDeps()>,
ops::ActivationDoubleGradOpInplaceInferer);
// Register Sigmoid/GradSigmoid Kernels
REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor,
SigmoidGradFunctor);
// Register DoubleGrad Kernel
REGISTER_OP_CPU_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CPUDeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* ========================== tanh register ============================= */ /* ========================== tanh register ============================= */
REGISTER_OPERATOR( REGISTER_OPERATOR(
tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType,
......
...@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1481,6 +1481,21 @@ REGISTER_OP_CUDA_KERNEL(
#endif #endif
/* ========================================================================== */ /* ========================================================================== */
/* =========================== sigmoid register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor,
CudaSigmoidGradFunctor);
REGISTER_OP_CUDA_KERNEL(
sigmoid_grad_grad,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<float>>,
ops::SigmoidDoubleGradKernel<paddle::platform::CUDADeviceContext,
ops::SigmoidGradGradFunctor<double>>,
ops::SigmoidDoubleGradKernel<plat::CUDADeviceContext,
ops::SigmoidGradGradFunctor<plat::float16>>);
/* ========================================================================== */
/* =========================== tanh register ============================ */ /* =========================== tanh register ============================ */
REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor,
CudaTanhGradFunctor); CudaTanhGradFunctor);
...@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -1595,7 +1610,6 @@ REGISTER_OP_CUDA_KERNEL(
/* ========================================================================== */ /* ========================================================================== */
#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ #define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \
__macro(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); \
__macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \
CudaLogSigmoidGradFunctor); \ CudaLogSigmoidGradFunctor); \
......
...@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> { ...@@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
/*
    Out
    DOut -> SigmoidGradGrad -> DOutNew
    DDX                        DDOut

    DDOut = (1-Out)*Out*DDX
    DOutNew = (1-2*Out)*DOut*DDX
*/
template <typename T>
struct SigmoidGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* Out,
const framework::Tensor* ddX, const framework::Tensor* dOut,
framework::Tensor* dOutNew, framework::Tensor* ddOut) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad"));
auto out = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad"));
if (dOutNew) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad"));
auto dout_new = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad"));
dout_new.device(*d) =
(static_cast<T>(1) - static_cast<T>(2) * out) * dout * ddx;
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad"));
ddout.device(*d) = (static_cast<T>(1) - out) * out * ddx;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
};
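A self-contained finite-difference check (a sketch, not part of the patch) of the two double-grad identities stated in the comment above; all names and constants are illustrative:

#include <cassert>
#include <cmath>
#include <cstdio>

double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }

// First-order sigmoid grad as a function of x and dout: dx = dout * y * (1 - y).
double grad(double x, double dout) {
  double y = sigmoid(x);
  return dout * y * (1.0 - y);
}

int main() {
  const double x = 0.3, dout = 0.7, ddx = 1.0, eps = 1e-6;
  const double y = sigmoid(x);
  // DDOut = (1 - Out) * Out * DDX: dx is linear in dout with slope y(1-y).
  double ddout_analytic = (1.0 - y) * y * ddx;
  double ddout_numeric =
      (grad(x, dout + eps) - grad(x, dout - eps)) / (2.0 * eps) * ddx;
  assert(std::fabs(ddout_analytic - ddout_numeric) < 1e-6);
  // DOutNew = (1 - 2*Out) * DOut * DDX: d(dx)/dx = dout*y(1-y)(1-2y), and
  // dividing by dy/dx = y(1-y) gives the sensitivity w.r.t. Out itself.
  double doutnew_analytic = (1.0 - 2.0 * y) * dout * ddx;
  double doutnew_numeric = (grad(x + eps, dout) - grad(x - eps, dout)) /
                           (2.0 * eps) / (y * (1.0 - y)) * ddx;
  assert(std::fabs(doutnew_analytic - doutnew_numeric) < 1e-5);
  std::printf("sigmoid double-grad identities verified\n");
  return 0;
}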
// silu(x) = x / (1 + exp(-x)) // silu(x) = x / (1 + exp(-x))
template <typename T> template <typename T>
struct SiluFunctor : public BaseActivationFunctor<T> { struct SiluFunctor : public BaseActivationFunctor<T> {
...@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut( ...@@ -1789,6 +1826,50 @@ inline void ExtractDoubleGradTensorWithInputDOut(
} }
} }
template <typename DeviceContext, typename Functor>
class SigmoidDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *Out, *ddX, *dOut;
framework::Tensor *dOutNew, *ddOut;
Out = ddX = dOut = nullptr;
dOutNew = ddOut = nullptr;
// extract ddX (input) and Out (input)
ddX = ctx.Input<framework::Tensor>("DDX");
Out = ctx.Input<framework::Tensor>("Out");
PADDLE_ENFORCE_NOT_NULL(
ddX, platform::errors::NotFound(
"Cannot get input Variable ddX, variable name = %s",
ctx.InputName("DDX")));
PADDLE_ENFORCE_NOT_NULL(
Out, platform::errors::NotFound(
"Cannot get input Variable Out, variable name = %s",
ctx.InputName("Out")));
// set output ddout
ddOut = ctx.Output<framework::Tensor>("DDOut");
// extract dOut (input)
dOut = ctx.Input<framework::Tensor>("DOut");
PADDLE_ENFORCE_NOT_NULL(
dOut, platform::errors::NotFound(
"Cannot get input Variable dOut, variable name = %s",
ctx.InputName("DOut")));
// set output dout_new
dOutNew = ctx.Output<framework::Tensor>("DOutNew");
if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
functor(place, Out, ddX, dOut, dOutNew, ddOut);
}
};
template <typename DeviceContext, typename Functor> template <typename DeviceContext, typename Functor>
class TanhDoubleGradKernel class TanhDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> { : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
...@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -2153,7 +2234,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
} // namespace paddle } // namespace paddle
#define FOR_EACH_ACTIVATION_OP(__macro) \ #define FOR_EACH_ACTIVATION_OP(__macro) \
__macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \
__macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \
__macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \
__macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \
......
...@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> { ...@@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Power", {*x}, {*out}, const auto& runner = NpuOpRunner("Power", {*x}, {*out},
{{"power", factor}, {{"power", factor},
{"scale", static_cast<float>(1.0)}, {"scale", static_cast<float>(1.0)},
{"shift", static_cast<float>(0.0)}}); {"shift", static_cast<float>(0.0)}});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> { ...@@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// Step1: Compute x_pow = x.pow(factor-1) // Step1: Compute x_pow = x.pow(factor-1)
Tensor x_pow(x->type()); Tensor x_pow(x->type());
x_pow.mutable_data<T>(x->dims(), place); x_pow.mutable_data<T>(x->dims(), place);
auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, const auto& runner_pow = NpuOpRunner(
{{"power", factor - static_cast<float>(1)}}); "Power", {*x}, {x_pow}, {{"power", factor - static_cast<float>(1)}});
runner_pow.Run(stream); runner_pow.Run(stream);
// Step 2: Construct a broadcast factor, which has the same shape with x. // Step 2: Construct a broadcast factor, which has the same shape with x.
...@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> { ...@@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel<T> {
// factor. // factor.
Tensor factor_bc_tensor(framework::proto::VarType::FP32); Tensor factor_bc_tensor(framework::proto::VarType::FP32);
factor_bc_tensor.mutable_data<float>(x_dims, place); factor_bc_tensor.mutable_data<float>(x_dims, place);
auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, const auto& runner_bc =
{{"dims", framework::vectorize(x_dims)}}); NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor},
{{"dims", framework::vectorize(x_dims)}});
runner_bc.Run(stream); runner_bc.Run(stream);
// Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1)
Tensor x_power_mul_factor(x->type()); Tensor x_power_mul_factor(x->type());
x_power_mul_factor.mutable_data<T>(x->dims(), place); x_power_mul_factor.mutable_data<T>(x->dims(), place);
auto runner_mul_1 = const auto& runner_mul_1 =
NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {});
runner_mul_1.Run(stream); runner_mul_1.Run(stream);
// Step 4: Compute dx = dout * factor * x.pow(factor-1) // Step 4: Compute dx = dout * factor * x.pow(factor-1)
dx->mutable_data<T>(place); dx->mutable_data<T>(place);
auto runner_mul_2 = const auto& runner_mul_2 =
NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {});
runner_mul_2.Run(stream); runner_mul_2.Run(stream);
} }
...@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> { ...@@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel<T> {
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("Relu", const auto& runner = NpuOpRunner("Relu",
{ {
*x, *x,
}, },
{*out}, {}); {*out}, {});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
...@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> { ...@@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel<T> {
.stream(); .stream();
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {});
runner.Run(stream); runner.Run(stream);
} }
...@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> { ...@@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> { ...@@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
}; };
...@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> { ...@@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel<T> {
Tensor one(x->type()); Tensor one(x->type());
one.mutable_data<T>(x->dims(), place); one.mutable_data<T>(x->dims(), place);
auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {});
one_runner.Run(stream); runner_one.Run(stream);
Tensor sub(x->type()); Tensor sub(x->type());
sub.mutable_data<T>(x->dims(), place); sub.mutable_data<T>(x->dims(), place);
auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {});
sub_runner.Run(stream); runner_sub.Run(stream);
auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {});
out_runner.Run(stream); runner_out.Run(stream);
} }
}; };
...@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> { ...@@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel<T> {
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> { ...@@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
...@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> { ...@@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {});
dx_runner.Run(stream); runner_dx.Run(stream);
} }
}; };
...@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> { ...@@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel<T> {
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {});
runner.Run(stream); runner.Run(stream);
} }
}; };
......
...@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel<T> {
auto* float_status = ctx.Output<framework::Tensor>("FloatStatus"); auto* float_status = ctx.Output<framework::Tensor>("FloatStatus");
float_status->mutable_data<T>(ctx.GetPlace()); float_status->mutable_data<T>(ctx.GetPlace());
auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); const auto& runner =
NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status});
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
......
...@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
found_inf->mutable_data<bool>(ctx.GetPlace()); found_inf->mutable_data<bool>(ctx.GetPlace());
bool found_inf_data = false;
auto stream = auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>() ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream(); .stream();
// step1: inverse scale(RealDiv) // step1: inverse scale
Tensor const_tensor; Tensor const_tensor;
const_tensor.mutable_data<T>({1}, ctx.GetPlace()); const_tensor.mutable_data<T>({1}, ctx.GetPlace());
FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0)); FillNpuTensorWithConstant<T>(&const_tensor, static_cast<T>(1.0));
...@@ -58,7 +56,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -58,7 +56,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
Tensor inverse_out(scale->type()); Tensor inverse_out(scale->type());
inverse_out.Resize(scale->dims()); inverse_out.Resize(scale->dims());
inverse_out.mutable_data<T>(ctx.GetPlace()); inverse_out.mutable_data<T>(ctx.GetPlace());
auto runner_inverse = const auto& runner_inverse =
NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {});
runner_inverse.Run(stream); runner_inverse.Run(stream);
tmp_inverse_out = &inverse_out; tmp_inverse_out = &inverse_out;
...@@ -66,55 +64,41 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> { ...@@ -66,55 +64,41 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel<T> {
// NOTE(zhiqiu): // NOTE(zhiqiu):
Tensor tmp; Tensor tmp;
tmp.mutable_data<float>({8}, ctx.GetPlace()); tmp.mutable_data<float>({8}, ctx.GetPlace());
// NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place.
// tmp is only placeholder. // tmp is only placeholder.
auto runner_float_status = const auto& runner_float_status =
NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp},
{{"message", std::string("check_nan_and_inf")}}); {{"message", std::string("check_nan_and_inf")}});
runner_float_status.Run(stream); runner_float_status.Run(stream);
Tensor sum; Tensor sum;
sum.mutable_data<float>({1}, ctx.GetPlace()); sum.mutable_data<float>({1}, ctx.GetPlace());
auto runner_reduce_sum = const auto& runner_reduce_sum =
NpuOpRunner("ReduceSumD", {*float_status}, {sum}, NpuOpRunner("ReduceSumD", {*float_status}, {sum},
{{"axes", std::vector<int>{0}}, {"keep_dims", true}}); {{"axes", std::vector<int>{0}}, {"keep_dims", true}});
runner_reduce_sum.Run(stream); runner_reduce_sum.Run(stream);
std::vector<float> sum_vec; const auto& runner_greater =
TensorToVector( NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {});
sum, ctx.template device_context<paddle::platform::NPUDeviceContext>(), runner_greater.Run(stream);
&sum_vec);
found_inf_data = (sum_vec[0] > 1); // NOTE(zhiqiu): The normal logic is :
// out = in, if found_inf = true
VLOG(4) << "found_inf_data:" << found_inf_data; // out = in/scale, if found_inf = false
// However, on NPU, in order to avoid stream sync, we do not copy the
// found_inf data to cpu to check whether to unscale or not.
// Instead, we do the Mul no matter found_inf or not.
// And, a fact is, only few steps contains nan/inf during training.
for (size_t i = 0; i < xs.size(); ++i) { for (size_t i = 0; i < xs.size(); ++i) {
const auto* x = xs[i]; const auto* x = xs[i];
auto* out = outs[i]; auto* out = outs[i];
out->mutable_data<T>(ctx.GetPlace()); out->mutable_data<T>(ctx.GetPlace());
if (!found_inf_data) { const auto& runner_mul =
// MatMul NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
auto runner_matmul = runner_mul.Run(stream);
NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {});
runner_matmul.Run(stream);
}
} }
// set found_inf to true const auto& runner_clear_status =
VLOG(4) << "found overflow:" << found_inf_data;
Tensor found_inf_tensor;
found_inf_tensor.Resize({1});
bool* is_found_inf =
found_inf_tensor.mutable_data<bool>(paddle::platform::CPUPlace());
*is_found_inf = found_inf_data;
framework::TensorCopy(
found_inf_tensor, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), found_inf);
ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
auto runner_clear_status =
NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp});
runner_clear_status.Run(stream); runner_clear_status.Run(stream);
} }
......
...@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -43,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx,
Tensor factor_tensor(bad_out_tensor->type()); Tensor factor_tensor(bad_out_tensor->type());
factor_tensor.mutable_data<int>({1}, place); factor_tensor.mutable_data<int>({1}, place);
FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1)); FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, const auto& runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor},
{*bad_out_tensor}, {}); {*bad_out_tensor}, {});
runner_p2.Run(stream); runner_p2.Run(stream);
std::vector<int> bad_out_data; std::vector<int> bad_out_data;
TensorToVector(*bad_out_tensor, ctx, &bad_out_data); TensorToVector(*bad_out_tensor, ctx, &bad_out_data);
if (bad_out_data[0] == decr_every_n_nan_or_inf) { if (bad_out_data[0] == decr_every_n_nan_or_inf) {
auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", decr_ratio}, {"scale", decr_ratio},
{"shift", static_cast<float>(0)}}); {"shift", static_cast<float>(0)}});
runner_p3.Run(stream); runner_p3.Run(stream);
...@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx, ...@@ -62,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx,
TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
if (new_loss_scaling[0] < static_cast<T>(1)) { if (new_loss_scaling[0] < static_cast<T>(1)) {
// updated_loss_scaling_data = 1 // updated_loss_scaling_data = 1
auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
{*updated_loss_scaling_tensor}, {*updated_loss_scaling_tensor},
{{"power", static_cast<float>(1)}, {{"power", static_cast<float>(1)},
{"scale", static_cast<float>(0)}, {"scale", static_cast<float>(0)},
{"shift", static_cast<float>(1)}}); {"shift", static_cast<float>(1)}});
runner_p4.Run(stream); runner_p4.Run(stream);
} }
@@ -86,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx,
   Tensor factor_tensor(good_out_tensor->type());
   factor_tensor.mutable_data<int>({1}, place);
   FillNpuTensorWithConstant<int>(&factor_tensor, static_cast<int>(1));
-  auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
-                               {*good_out_tensor}, {});
+  const auto& runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor},
+                                      {*good_out_tensor}, {});
   runner_p2.Run(stream);
   std::vector<int> good_out_data;
   TensorToVector(*good_out_tensor, ctx, &good_out_data);
   if (good_out_data[0] == incr_every_n_steps) {
-    auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
-                                 {*updated_loss_scaling_tensor},
-                                 {{"power", static_cast<float>(1)},
-                                  {"scale", incr_ratio},
-                                  {"shift", static_cast<float>(0)}});
+    const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
+                                        {*updated_loss_scaling_tensor},
+                                        {{"power", static_cast<float>(1)},
+                                         {"scale", incr_ratio},
+                                         {"shift", static_cast<float>(0)}});
     runner_p3.Run(stream);
     std::vector<T> new_loss_scaling;
     TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling);
     if (!std::isfinite(new_loss_scaling[0])) {
       // updated_loss_scaling_data = pre_loss_scaling_data
-      auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
-                                   {*updated_loss_scaling_tensor},
-                                   {{"power", static_cast<float>(1)},
-                                    {"scale", static_cast<float>(1)},
-                                    {"shift", static_cast<float>(0)}});
+      const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor},
+                                          {*updated_loss_scaling_tensor},
+                                          {{"power", static_cast<float>(1)},
+                                           {"scale", static_cast<float>(1)},
+                                           {"shift", static_cast<float>(0)}});
       runner_p4.Run(stream);
     }
@@ -165,7 +165,7 @@ class LazyZerosNPU {
       }
       zero_tensor->mutable_data<T>(place);
-      auto runner_zeros =
+      const auto& runner_zeros =
           NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor});
       runner_zeros.Run(stream);
       zero_tensor->check_memory_size();
......
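For context on the hunks above: Update() implements dynamic loss scaling for NPU mixed-precision training, expressed as NpuOpRunner calls (Add counts steps, Power rescales). The control flow is easier to see without the runner plumbing. The plain C++ sketch below is an illustrative reconstruction written for this note, not code from the commit (the LossScaleState type and counter-reset details are assumptions): after incr_every_n_steps consecutive finite steps the scale is multiplied by incr_ratio (kept only if still finite, per runner_p4), and after decr_every_n_nan_or_inf consecutive bad steps it is multiplied by decr_ratio and clamped to at least 1.

#include <cmath>

// Hypothetical host-side mirror of the NPU Update() logic shown above.
struct LossScaleState {
  float scale = 32768.0f;
  int good_steps = 0;
  int bad_steps = 0;
};

void UpdateLossScale(bool found_inf, float incr_ratio, float decr_ratio,
                     int incr_every_n_steps, int decr_every_n_nan_or_inf,
                     LossScaleState* s) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      s->scale *= decr_ratio;                // runner_p3: Power, scale=decr_ratio
      if (s->scale < 1.0f) s->scale = 1.0f;  // runner_p4: reset to 1
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float next = s->scale * incr_ratio;         // runner_p3: Power, scale=incr_ratio
      if (std::isfinite(next)) s->scale = next;   // runner_p4: otherwise keep old scale
      s->good_steps = 0;
    }
  }
}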
@@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel<T> {
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());
-   auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
+   const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {});
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
......
cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
        DEPS memory timer framework_proto proto_desc lod_tensor op_registry
-       device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
+       device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function)
@@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel<T> {
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
-   auto runner = NpuOpRunner("Cast", {*x}, {*out},
-                             {{"dst_type", static_cast<int32_t>(aclDtype)}});
+   const auto& runner = NpuOpRunner(
+       "Cast", {*x}, {*out}, {{"dst_type", static_cast<int32_t>(aclDtype)}});
    runner.Run(stream);
  }
};
......
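A change repeated across all the NPU kernel hunks above is replacing `auto runner = NpuOpRunner(...)` with `const auto& runner = NpuOpRunner(...)`. Binding the returned temporary to a const reference extends its lifetime to the end of the enclosing scope and guarantees no second runner object is created, instead of relying on copy elision. A minimal standalone illustration (Widget is a stand-in type invented for this sketch, not part of Paddle):

#include <iostream>

// Stand-in for a class like NpuOpRunner that is built and then Run().
struct Widget {
  Widget() { std::cout << "construct\n"; }
  Widget(const Widget&) { std::cout << "copy\n"; }
  ~Widget() { std::cout << "destruct\n"; }
  void Run() const { std::cout << "run\n"; }
};

Widget MakeWidget() { return Widget(); }

int main() {
  // Initializes a new named object from the temporary; whether a copy
  // happens depends on copy elision (guaranteed only since C++17).
  auto w1 = MakeWidget();
  w1.Run();

  // Binds the temporary to a const reference; its lifetime is extended to
  // the end of the scope and no second object can ever be created. This is
  // the form the diff standardizes on.
  const auto& w2 = MakeWidget();
  w2.Run();
  return 0;
}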
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_embedding_op.h"
namespace paddle {
namespace operators {
class CEmbeddingOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CEmbeddingOp");
OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "CEmbeddingOp");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CEmbeddingOp");
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
int ids_rank = ids_dims.size();
VLOG(5) << "ids rank is " << ids_rank << std::endl;
    PADDLE_ENFORCE_EQ(
        table_dims.size(), 2,
        platform::errors::InvalidArgument(
            "ShapeError: The input W of 'c_embedding' must be a 2-D tensor. "
            "But received W's dimensions = %d, "
            "W's shape = [%s].",
            table_dims.size(), table_dims));
auto output_dims = framework::vectorize(ids_dims);
output_dims.push_back(table_dims[1]);
ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
if (ctx->GetOutputsVarType("Out")[0] ==
framework::proto::VarType::LOD_TENSOR) {
ctx->ShareLoD("Ids", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "W");
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("W",
"(Tensor) The input represents embedding tensors, "
"which is a learnable parameter.");
AddInput("Ids",
"An input with type int64 "
"contains the ids to be looked up in W.");
AddOutput("Out", "The lookup results, which have the same type as W.");
    AddAttr<int64_t>("start_index",
                     "(int64, default 0) The starting index of this rank's "
                     "slice of the embedding table; ids that fall outside "
                     "the slice produce zero rows in the output.")
        .SetDefault(0);
AddComment(R"DOC(
c_embedding Operator.
This operator is used to perform lookups on the parameter W,
then concatenated into a dense tensor.
The input Ids can carry the LoD (Level of Details) information,
or not. And the output only shares the LoD information with input Ids.
)DOC");
}
};
DECLARE_NO_NEED_BUFFER_VARS_INFERER(CEmbeddingGradOpNoBufferVarsInferer, "W");
template <typename T>
class CEmbeddingGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("c_embedding_grad");
op->SetInput("W", this->Input("W"));
op->SetInput("Ids", this->Input("Ids"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
op->SetAttrMap(this->Attrs());
}
};
class CEmbeddingOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto table_dims = ctx->GetInputDim("W");
ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
return framework::OpKernelType(data_type, ctx.device_context());
}
};
class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext* ctx) const override {
auto out_var_name = framework::GradVarName("W");
VLOG(3) << "c_embedding_grad op " << framework::GradVarName("W")
<< " is set to LoDTensor";
ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR);
ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker,
ops::CEmbeddingGradOpMaker<paddle::framework::OpDesc>,
ops::CEmbeddingGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad,
ops::CEmbeddingGradOpNoBufferVarsInferer,
ops::CEmbeddingOpGradVarTypeInference);
REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel<float>,
ops::CEmbeddingOpCPUKernel<double>);
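The new c_embedding op is a building block for sharded (model-parallel) embeddings: each rank holds an (N x D) slice of the full table starting at start_index, looks up only the ids that fall in [start_index, start_index + N), and emits zero rows for the rest, so that combining the per-rank outputs (typically with an allreduce) yields the full lookup. The scalar C++ sketch below is a reference for the forward semantics, written for this description rather than taken from the commit:

#include <cstdint>
#include <vector>

// Illustrative scalar version of the CEmbedding forward pass:
// table is the local (N x D) shard, ids has K entries, out is (K x D).
void CEmbeddingRef(const std::vector<float>& table, int64_t N, int64_t D,
                   const std::vector<int64_t>& ids, int64_t start_index,
                   std::vector<float>* out) {
  const int64_t K = static_cast<int64_t>(ids.size());
  out->assign(K * D, 0.0f);  // ids outside this shard stay zero
  for (int64_t k = 0; k < K; ++k) {
    const int64_t local = ids[k] - start_index;
    if (local < 0 || local >= N) continue;  // owned by another rank
    for (int64_t d = 0; d < D; ++d) {
      (*out)[k * D + d] = table[local * D + d];
    }
  }
}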
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/collective/c_embedding_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;

// Ceiling division of N elements over 512-thread blocks, capped at 4096
// blocks; larger inputs are covered by the grid-stride CUDA_KERNEL_LOOP.
static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}
template <typename T, typename IndexT>
__global__ void CEmbedding(T *out, const T *table, const IndexT *ids,
const int rows, const int columns, const int64_t N,
const int64_t start_idx, const int64_t end_idx,
const int64_t limit) {
CUDA_KERNEL_LOOP(i, limit) {
size_t row = i / columns;
size_t col = i % columns;
auto id = ids[row];
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
PADDLE_ENFORCE(real_idx < N,
"The index is out of bounds, "
"please check whether the dimensions of index and "
"input meet the requirements. It should "
"be less than [%d], but received [%d]",
N, real_idx);
out[i] = table[real_idx * columns + col];
} else {
out[i] = static_cast<T>(0);
}
}
}
template <typename T, typename IndexT>
__global__ void CEmbeddingGrad(T *table, const T *output, const IndexT *ids,
const int rows, const int columns,
const int64_t N, const int64_t start_idx,
const int64_t end_idx, const int64_t limit) {
CUDA_KERNEL_LOOP(i, limit) {
size_t row = i / columns;
size_t col = i % columns;
auto id = ids[row];
if (id >= start_idx && id < end_idx) {
auto real_idx = id - start_idx;
paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col],
output[i]);
}
}
}
template <typename T>
class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
auto *table_t = context.Input<LoDTensor>("W");
auto *ids_t = context.Input<LoDTensor>("Ids");
auto *output_t = context.Output<LoDTensor>("Out");
const auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index");
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
size_t K = ids_t->numel();
const int64_t end_idx = start_idx + N;
auto *table = table_t->data<T>();
auto *output = output_t->mutable_data<T>(context.GetPlace());
auto limit = K * D;
int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads;
const auto &index_type = ids_t->type();
if (index_type == framework::proto::VarType::INT32) {
CEmbedding<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
output, table, ids_t->data<int32_t>(), K, D, N, start_idx, end_idx,
limit);
} else if (index_type == framework::proto::VarType::INT64) {
CEmbedding<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
output, table, ids_t->data<int64_t>(), K, D, N, start_idx, end_idx,
limit);
}
}
};
template <typename T>
class CEmbeddingGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const int64_t start_idx = context.Attr<int64_t>("start_index");
auto ids_t = context.Input<LoDTensor>("Ids");
auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto d_table_t = context.Output<LoDTensor>(framework::GradVarName("W"));
int N = d_table_t->dims()[0];
int D = d_table_t->dims()[1];
int K = ids_t->numel();
const int64_t end_idx = start_idx + N;
auto limit = K * D;
int blocks = NumBlocks(limit);
int threads = kNumCUDAThreads;
const T *d_output = d_output_t->data<T>();
T *d_table = d_table_t->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*d_table_t);
t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
const auto &index_type = ids_t->type();
if (index_type == framework::proto::VarType::INT32) {
CEmbeddingGrad<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids_t->data<int32_t>(), K, D, N, start_idx,
end_idx, limit);
} else if (index_type == framework::proto::VarType::INT64) {
CEmbeddingGrad<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
d_table, d_output, ids_t->data<int64_t>(), K, D, N, start_idx,
end_idx, limit);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_embedding, ops::CEmbeddingCUDAKernel<float>,
ops::CEmbeddingCUDAKernel<double>,
ops::CEmbeddingCUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(c_embedding_grad, ops::CEmbeddingGradCUDAKernel<float>,
ops::CEmbeddingGradCUDAKernel<double>,
ops::CEmbeddingGradCUDAKernel<plat::float16>);
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,12 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/sign_op.h"
-#include "paddle/fluid/platform/float16.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    sign,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>);
+#pragma once
+
+#include <algorithm>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class CEmbeddingOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support c_embedding for cpu kernel now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
@@ -45,6 +45,12 @@ class CSplitOp : public framework::OperatorWithKernel {
                           rank, nranks));
     framework::DDim dim = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(
+        dim[dim.size() - 1] % nranks, 0,
+        platform::errors::InvalidArgument("The last dimension (%d) of the X "
+                                          "should be divisible by nranks (%d)",
+                                          dim[dim.size() - 1], nranks));
     dim[dim.size() - 1] = dim[dim.size() - 1] / nranks;
     if (dim[0] < 0) dim[0] = -1;
     ctx->SetOutputDim("Out", dim);
......
@@ -16,10 +16,38 @@ limitations under the License. */

 #include "paddle/fluid/operators/collective/c_split_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/platform/cuda_primitives.h"

 namespace paddle {
 namespace operators {

+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaxinumNumBlocks = 4096;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaxinumNumBlocks);
+}
+
+template <typename T>
+__global__ void SplitFromRank(const T* input, T* output, const int rows,
+                              const int columns, const int rank,
+                              const int nranks, const int limit) {
+  CUDA_KERNEL_LOOP(i, limit) {
+    int row = i / columns;
+    int col = i % columns;
+
+    int block = columns / nranks;
+    int start = block * rank;
+    int end = start + block;
+
+    if (col >= start && col < end) {
+      int idx = block * row + col % block;
+      output[idx] = input[i];
+    }
+  }
+}
+
 template <typename T>
 class CSplitOpCUDAKernel : public framework::OpKernel<T> {
  public:
@@ -47,24 +75,25 @@ class CSplitOpCUDAKernel : public framework::OpKernel<T> {
                           rank, nranks));
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

-    std::vector<const framework::Tensor*> shape_refer;
-    std::vector<framework::Tensor*> results;
-    size_t numel = x->numel();
     auto dims = x->dims();
-    numel /= nranks;
-    int axis = dims.size() - 1;
-    dims[dims.size() - 1] /= nranks;
-    for (int i = 0; i < nranks; i++) {
-      framework::Tensor* out = new framework::Tensor();
-      out->mutable_data<T>(dims, place);
-      shape_refer.emplace_back(out);
-      results.emplace_back(out);
-    }
-
-    math::SplitFunctor<platform::CUDADeviceContext, T> functor;
-    functor(dev_ctx, *x, shape_refer, axis, &results);
+    auto dims_size = dims.size();
+    // final dim
+    int64_t end_size = dims[dims_size - 1];
+
+    // remain dim
+    auto remain_ddim = framework::slice_ddim(dims, 0, dims_size - 1);
+    int64_t remain_numel = framework::product(remain_ddim);
+
+    int limit = x->numel();
+    int blocks = NumBlocks(limit);
+    int threads = kNumCUDAThreads;
+
+    dims[dims_size - 1] /= nranks;
     out->mutable_data<T>(dims, place);
-    paddle::framework::TensorCopySync(*results[rank], out->place(), out);
+
+    SplitFromRank<T><<<blocks, threads, 0, dev_ctx.stream()>>>(
+        x->data<T>(), out->data<T>(), remain_numel, end_size, rank, nranks,
+        limit);
   }
 };

 }  // namespace operators
......
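The rewrite above replaces the SplitFunctor-plus-TensorCopySync path, which materialized all nranks pieces and then copied one, with a single kernel that writes only this rank's contiguous chunk of the last dimension. An index-level reference in plain C++ (illustrative only, derived from the SplitFromRank kernel above):

#include <vector>

// Scalar reference for SplitFromRank: input is (rows x columns), output is
// (rows x columns/nranks) holding the rank-th slice of each row.
void SplitFromRankRef(const std::vector<float>& input, int rows, int columns,
                      int rank, int nranks, std::vector<float>* output) {
  const int block = columns / nranks;  // width of each rank's slice
  const int start = block * rank;
  output->resize(static_cast<size_t>(rows) * block);
  for (int row = 0; row < rows; ++row) {
    for (int j = 0; j < block; ++j) {
      (*output)[row * block + j] = input[row * columns + start + j];
    }
  }
}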
type: "batch_norm"
def {
inputs {
name: "X"
}
inputs {
name: "Scale"
}
inputs {
name: "Bias"
}
inputs {
name: "Mean"
}
inputs {
name: "Variance"
}
outputs {
name: "Y"
}
attrs {
name: "epsilon"
type: FLOAT
}
}
extra {
inputs {
name: "MomentumTensor"
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "momentum"
type: FLOAT
}
attrs {
name: "data_layout"
type: STRING
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "fuse_with_relu"
type: BOOLEAN
}
attrs {
name: "use_global_stats"
type: BOOLEAN
}
attrs {
name: "trainable_statistics"
type: BOOLEAN
}
outputs {
name: "MeanOut"
}
outputs {
name: "VarianceOut"
}
outputs {
name: "SavedMean"
}
outputs {
name: "SavedVariance"
}
outputs {
name: "ReserveSpace"
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "concat"
def {
inputs {
name: "X"
}
inputs {
name: "AxisTensor"
}
outputs {
name: "Out"
}
attrs {
name: "axis"
type: INT
}
}
extra {
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
type: "conv2d"
def {
inputs {
name: "Input"
}
inputs {
name: "Filter"
}
inputs {
name: "Bias"
}
outputs {
name: "Output"
}
attrs {
name: "strides"
type: INTS
}
attrs {
name: "paddings"
type: INTS
}
attrs {
name: "padding_algorithm"
type: STRING
}
attrs {
name: "groups"
type: INT
}
attrs {
name: "dilations"
type: INTS
}
}
extra {
inputs {
name: "ResidualData"
}
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "use_cudnn"
type: BOOLEAN
}
attrs {
name: "fuse_relu_before_depthwise_conv"
type: BOOLEAN
}
attrs {
name: "use_mkldnn"
type: BOOLEAN
}
attrs {
name: "use_quantizer"
type: BOOLEAN
}
attrs {
name: "mkldnn_data_type"
type: STRING
}
attrs {
name: "fuse_relu"
type: BOOLEAN
}
attrs {
name: "fuse_brelu"
type: BOOLEAN
}
attrs {
name: "fuse_brelu_threshold"
type: FLOAT
}
attrs {
name: "fuse_activation"
type: STRING
}
attrs {
name: "fuse_alpha"
type: FLOAT
}
attrs {
name: "fuse_beta"
type: FLOAT
}
attrs {
name: "use_addto"
type: BOOLEAN
}
attrs {
name: "fuse_residual_connection"
type: BOOLEAN
}
attrs {
name: "Scale_in"
type: FLOAT
}
attrs {
name: "Scale_out"
type: FLOAT
}
attrs {
name: "Scale_in_eltwise"
type: FLOAT
}
attrs {
name: "Scale_weights"
type: FLOATS
}
attrs {
name: "force_fp32_output"
type: BOOLEAN
}
attrs {
name: "data_format"
type: STRING
}
attrs {
name: "workspace_size_MB"
type: INT
}
attrs {
name: "exhaustive_search"
type: BOOLEAN
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
[Diffs for 5 more files are collapsed.]
type: "fake_channel_wise_quantize_abs_max"
def {
inputs {
name: "X"
}
outputs {
name: "Out"
}
outputs {
name: "OutScale"
}
attrs {
name: "quant_axis"
type: INT
}
attrs {
name: "bit_length"
type: INT
}
}
extra {
attrs {
name: "is_test"
type: BOOLEAN
}
attrs {
name: "op_role"
type: INT
}
attrs {
name: "op_role_var"
type: STRINGS
}
attrs {
name: "op_namescope"
type: STRING
}
attrs {
name: "op_callstack"
type: STRINGS
}
attrs {
name: "op_device"
type: STRING
}
}
[Diffs for 124 more files are collapsed.]